Add files using upload-large-folder tool
Browse files- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug-internal.log +63 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug.log +23 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/files/output.log +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log +82 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log +94 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/run-ln8ma2mo.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/files/output.log +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log +82 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log +94 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/run-mz3ej8ig.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/config.yaml +129 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/output.log +21 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-metadata.json +99 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log +18 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log +23 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/run-d21a8n96.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/files/output.log +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log +16 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log +16 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/run-y2lylvs5.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/config.yaml +237 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/output.log +0 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-metadata.json +99 -0
- ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log +42 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml +236 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log +21 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json +97 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log +42 -0
- ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug-internal.log +87 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/config.yaml +236 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/output.log +21 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log +87 -0
- ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_ckpt/wandb/debug-internal.log +19 -0
- ProtT3/all_checkpoints/stage1_ckpt/wandb/debug.log +23 -0
- ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/config.yaml +129 -0
- ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/output.log +2 -0
- ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:07:02.130913564+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:07:16.339720801+08:00","level":"INFO","msg":"created new stream","id":"rypk39yq"}
|
| 3 |
+
{"time":"2025-06-29T00:07:16.340562919+08:00","level":"INFO","msg":"stream: started","id":"rypk39yq"}
|
| 4 |
+
{"time":"2025-06-29T00:07:16.340584288+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rypk39yq"}
|
| 5 |
+
{"time":"2025-06-29T00:07:16.340617888+08:00","level":"INFO","msg":"sender: started","stream_id":"rypk39yq"}
|
| 6 |
+
{"time":"2025-06-29T00:07:16.340654242+08:00","level":"INFO","msg":"handler: started","stream_id":"rypk39yq"}
|
| 7 |
+
{"time":"2025-06-29T00:07:28.033909694+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-29T00:12:24.114755958+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52688->104.21.20.172:443: read: connection timed out"}
|
| 9 |
+
{"time":"2025-06-29T00:15:17.682707235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:43992->104.21.20.172:443: read: connection timed out"}
|
| 10 |
+
{"time":"2025-06-29T00:16:13.20335199+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 11 |
+
{"time":"2025-06-29T00:16:45.525802023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 12 |
+
{"time":"2025-06-29T00:17:19.98711773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 13 |
+
{"time":"2025-06-29T00:18:06.642780387+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:36080->172.67.193.61:443: read: connection timed out"}
|
| 14 |
+
{"time":"2025-06-29T00:22:43.123257688+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52664->172.67.193.61:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:26:08.434737599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42534->172.67.193.61:443: read: connection timed out"}
|
| 16 |
+
{"time":"2025-06-29T00:27:44.454100719+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53006->104.21.20.172:443: read: connection reset by peer"}
|
| 17 |
+
{"time":"2025-06-29T00:29:13.211268181+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 18 |
+
{"time":"2025-06-29T00:29:45.68436365+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 19 |
+
{"time":"2025-06-29T00:30:19.759580601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 20 |
+
{"time":"2025-06-29T00:30:33.650730605+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:38754->172.67.193.61:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-06-29T00:30:58.011093426+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 22 |
+
{"time":"2025-06-29T00:34:39.922752645+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:35350->172.67.193.61:443: read: connection timed out"}
|
| 23 |
+
{"time":"2025-06-29T00:36:41.88529828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
|
| 24 |
+
{"time":"2025-06-29T00:37:20.878368218+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:46470->104.21.20.172:443: read: connection reset by peer"}
|
| 25 |
+
{"time":"2025-06-29T00:38:49.414424011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-06-29T00:38:58.216757113+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 27 |
+
{"time":"2025-06-29T00:39:20.141003198+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53708->104.21.20.172:443: read: connection reset by peer"}
|
| 28 |
+
{"time":"2025-06-29T00:41:33.299264534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44198->104.21.20.172:443: read: connection reset by peer"}
|
| 29 |
+
{"time":"2025-06-29T00:47:37.138754922+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39138->172.67.193.61:443: read: connection timed out"}
|
| 30 |
+
{"time":"2025-06-29T00:54:28.224811124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T00:55:15.429710397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:49584->104.21.20.172:443: read: connection reset by peer"}
|
| 32 |
+
{"time":"2025-06-29T00:55:36.251525534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:55184->104.21.20.172:443: read: connection reset by peer"}
|
| 33 |
+
{"time":"2025-06-29T00:56:12.092902722+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": http2: client conn is closed"}
|
| 34 |
+
{"time":"2025-06-29T00:59:32.604209299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44582->172.67.193.61:443: read: connection reset by peer"}
|
| 35 |
+
{"time":"2025-06-29T01:00:43.231046844+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 36 |
+
{"time":"2025-06-29T01:05:28.234577388+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 37 |
+
{"time":"2025-06-29T01:06:00.428439859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 38 |
+
{"time":"2025-06-29T01:06:35.403033399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 39 |
+
{"time":"2025-06-29T01:07:13.835463934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 40 |
+
{"time":"2025-06-29T01:12:30.014897464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
|
| 41 |
+
{"time":"2025-06-29T01:14:58.239397356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 42 |
+
{"time":"2025-06-29T01:15:30.658073848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 43 |
+
{"time":"2025-06-29T01:16:05.133874663+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 44 |
+
{"time":"2025-06-29T01:16:43.256922452+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 45 |
+
{"time":"2025-06-29T01:17:07.122753765+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42208->172.67.193.61:443: read: connection timed out"}
|
| 46 |
+
{"time":"2025-06-29T01:17:31.631854783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 47 |
+
{"time":"2025-06-29T01:18:38.479583401+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 48 |
+
{"time":"2025-06-29T01:20:08.481626584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 49 |
+
{"time":"2025-06-29T01:21:38.483904393+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 50 |
+
{"time":"2025-06-29T01:22:09.185192206+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 51 |
+
{"time":"2025-06-29T01:28:06.578759778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39408->172.67.193.61:443: read: connection timed out"}
|
| 52 |
+
{"time":"2025-06-29T02:00:40.766530394+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"error code: 502"}
|
| 53 |
+
{"time":"2025-06-29T08:45:43.611887283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 54 |
+
{"time":"2025-06-29T08:45:55.061157169+08:00","level":"INFO","msg":"api: retrying HTTP error","status":520,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 520"}
|
| 55 |
+
{"time":"2025-06-29T08:51:23.638432293+08:00","level":"INFO","msg":"api: retrying HTTP error","status":524,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 524"}
|
| 56 |
+
{"time":"2025-06-29T10:16:08.309722526+08:00","level":"INFO","msg":"stream: closing","id":"rypk39yq"}
|
| 57 |
+
{"time":"2025-06-29T10:16:08.309813211+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 58 |
+
{"time":"2025-06-29T10:16:08.311047133+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 59 |
+
{"time":"2025-06-29T10:16:10.887637294+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 60 |
+
{"time":"2025-06-29T10:16:11.831362524+08:00","level":"INFO","msg":"handler: closed","stream_id":"rypk39yq"}
|
| 61 |
+
{"time":"2025-06-29T10:16:11.831401295+08:00","level":"INFO","msg":"sender: closed","stream_id":"rypk39yq"}
|
| 62 |
+
{"time":"2025-06-29T10:16:11.831391+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rypk39yq"}
|
| 63 |
+
{"time":"2025-06-29T10:16:11.835883161+08:00","level":"INFO","msg":"stream: closed","id":"rypk39yq"}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Configure stats pid to 938398
|
| 3 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log
|
| 7 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:07:02,122 INFO MainThread:938398 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:07:02,125 INFO MainThread:938398 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:07:02,126 INFO MainThread:938398 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:07:02,129 INFO MainThread:938398 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-29 00:07:27,982 INFO MainThread:938398 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-29 00:07:28,171 INFO MainThread:938398 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-29 00:07:28,172 INFO MainThread:938398 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-29 00:07:28,177 INFO MainThread:938398 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 10:16:08,240 INFO MsgRouterThr:938398 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/files/output.log
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-28T23:51:50.395952363+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-28T23:52:20.501357263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 3 |
+
{"time":"2025-06-28T23:52:52.730097178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 4 |
+
{"time":"2025-06-28T23:53:27.526291573+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 5 |
+
{"time":"2025-06-28T23:54:05.599297713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 6 |
+
{"time":"2025-06-28T23:54:54.222254418+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 7 |
+
{"time":"2025-06-28T23:56:01.770405008+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 8 |
+
{"time":"2025-06-28T23:57:31.772851692+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 9 |
+
{"time":"2025-06-28T23:59:01.774763186+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 10 |
+
{"time":"2025-06-29T00:00:31.777124138+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 11 |
+
{"time":"2025-06-29T00:02:01.779769246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 12 |
+
{"time":"2025-06-29T00:03:31.781963651+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 13 |
+
{"time":"2025-06-29T00:05:01.784626624+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 14 |
+
{"time":"2025-06-29T00:06:31.787236433+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 15 |
+
{"time":"2025-06-29T00:08:01.789516886+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 16 |
+
{"time":"2025-06-29T00:09:31.792010561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 17 |
+
{"time":"2025-06-29T00:11:01.793420114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 18 |
+
{"time":"2025-06-29T00:12:31.795510043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 19 |
+
{"time":"2025-06-29T00:14:01.797888169+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 20 |
+
{"time":"2025-06-29T00:15:31.800223897+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 21 |
+
{"time":"2025-06-29T00:17:01.802475149+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 22 |
+
{"time":"2025-06-29T00:18:31.804423686+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 23 |
+
{"time":"2025-06-29T00:18:31.805466058+08:00","level":"ERROR","msg":"Failed to load features, feature will default to disabled","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 24 |
+
{"time":"2025-06-29T00:18:31.817218548+08:00","level":"INFO","msg":"created new stream","id":"ln8ma2mo"}
|
| 25 |
+
{"time":"2025-06-29T00:18:31.817240068+08:00","level":"INFO","msg":"stream: started","id":"ln8ma2mo"}
|
| 26 |
+
{"time":"2025-06-29T00:18:31.817280389+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ln8ma2mo"}
|
| 27 |
+
{"time":"2025-06-29T00:18:31.817301971+08:00","level":"INFO","msg":"sender: started","stream_id":"ln8ma2mo"}
|
| 28 |
+
{"time":"2025-06-29T00:18:31.817292844+08:00","level":"INFO","msg":"handler: started","stream_id":"ln8ma2mo"}
|
| 29 |
+
{"time":"2025-06-29T00:18:31.817927543+08:00","level":"INFO","msg":"stream: closing","id":"ln8ma2mo"}
|
| 30 |
+
{"time":"2025-06-29T00:19:01.824830492+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T00:19:34.122683021+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T00:20:08.859081137+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 33 |
+
{"time":"2025-06-29T00:20:47.142810832+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 34 |
+
{"time":"2025-06-29T00:21:34.117938021+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 35 |
+
{"time":"2025-06-29T00:22:42.456861603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 36 |
+
{"time":"2025-06-29T00:24:12.458977097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 37 |
+
{"time":"2025-06-29T00:25:42.460628156+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 38 |
+
{"time":"2025-06-29T00:27:12.462982763+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 39 |
+
{"time":"2025-06-29T00:28:31.82404899+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000158503,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
|
| 40 |
+
{"time":"2025-06-29T00:28:42.464486984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 41 |
+
{"time":"2025-06-29T00:30:12.465370037+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 42 |
+
{"time":"2025-06-29T00:31:42.467621727+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 43 |
+
{"time":"2025-06-29T00:33:12.470149312+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 44 |
+
{"time":"2025-06-29T00:34:42.471834466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 45 |
+
{"time":"2025-06-29T00:36:12.474312455+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 46 |
+
{"time":"2025-06-29T00:37:42.477105562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 47 |
+
{"time":"2025-06-29T00:38:31.82690936+08:00","level":"WARN","msg":"sender: taking a long time","seconds":1200.003036845,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
|
| 48 |
+
{"time":"2025-06-29T00:39:12.479327707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 49 |
+
{"time":"2025-06-29T00:40:42.479854506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 50 |
+
{"time":"2025-06-29T00:42:12.482469938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 51 |
+
{"time":"2025-06-29T00:43:42.484924406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 52 |
+
{"time":"2025-06-29T00:45:12.487708397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 53 |
+
{"time":"2025-06-29T00:45:12.48877363+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 54 |
+
{"time":"2025-06-29T00:45:12.489160854+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
|
| 55 |
+
{"time":"2025-06-29T00:45:12.489257088+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1600.665365029,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
|
| 56 |
+
{"time":"2025-06-29T00:45:42.54537918+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 57 |
+
{"time":"2025-06-29T00:46:14.577405421+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 58 |
+
{"time":"2025-06-29T00:46:49.560523299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 59 |
+
{"time":"2025-06-29T00:47:29.044951023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 60 |
+
{"time":"2025-06-29T00:48:18.616524778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 61 |
+
{"time":"2025-06-29T00:49:24.200671698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 62 |
+
{"time":"2025-06-29T00:50:54.203239638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 63 |
+
{"time":"2025-06-29T00:52:24.20538484+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 64 |
+
{"time":"2025-06-29T00:53:54.207195938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 65 |
+
{"time":"2025-06-29T00:55:24.209241621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 66 |
+
{"time":"2025-06-29T00:56:54.212135665+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 67 |
+
{"time":"2025-06-29T00:58:24.214809507+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 68 |
+
{"time":"2025-06-29T00:59:54.217111858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 69 |
+
{"time":"2025-06-29T01:01:24.219045106+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 70 |
+
{"time":"2025-06-29T01:02:54.221102245+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 71 |
+
{"time":"2025-06-29T01:04:24.22398357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 72 |
+
{"time":"2025-06-29T01:05:54.226547409+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 73 |
+
{"time":"2025-06-29T01:07:24.228484779+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 74 |
+
{"time":"2025-06-29T01:08:54.231281118+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 75 |
+
{"time":"2025-06-29T01:10:24.233099781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 76 |
+
{"time":"2025-06-29T01:11:54.236192516+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 77 |
+
{"time":"2025-06-29T01:11:54.236277338+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 78 |
+
{"time":"2025-06-29T01:11:54.236659368+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 79 |
+
{"time":"2025-06-29T01:11:54.239839801+08:00","level":"INFO","msg":"handler: closed","stream_id":"ln8ma2mo"}
|
| 80 |
+
{"time":"2025-06-29T01:11:54.239896009+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ln8ma2mo"}
|
| 81 |
+
{"time":"2025-06-29T01:11:54.239911685+08:00","level":"INFO","msg":"sender: closed","stream_id":"ln8ma2mo"}
|
| 82 |
+
{"time":"2025-06-29T01:11:54.242908101+08:00","level":"INFO","msg":"stream: closed","id":"ln8ma2mo"}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Configure stats pid to 906089
|
| 3 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log
|
| 7 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log
|
| 8 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-28 23:51:50,389 INFO MainThread:906089 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-28 23:51:50,390 INFO MainThread:906089 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-28 23:51:50,391 INFO MainThread:906089 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-28 23:51:50,393 INFO MainThread:906089 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-28 23:51:59,762 WARNING MainThread:906089 [wandb_init.py:init():1681] [no run ID] interrupted
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init
|
| 20 |
+
return wi.init(run_settings, run_config, run_printer)
|
| 21 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init
|
| 22 |
+
result = wait_with_progress(
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
|
| 24 |
+
return wait_all_with_progress(
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
|
| 26 |
+
return asyncio_compat.run(progress_loop_with_timeout)
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run
|
| 28 |
+
return future.result()
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result
|
| 30 |
+
self._condition.wait(timeout)
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait
|
| 32 |
+
waiter.acquire()
|
| 33 |
+
KeyboardInterrupt
|
| 34 |
+
2025-06-28 23:52:00,349 INFO MsgRouterThr:906089 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
| 35 |
+
2025-06-28 23:52:21,541 INFO Thread-3 (wrapped_target):906089 [retry.py:__call__():175] [no run ID] Retry attempt failed:
|
| 36 |
+
Traceback (most recent call last):
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn
|
| 38 |
+
sock = connection.create_connection(
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
|
| 40 |
+
raise err
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
|
| 42 |
+
sock.connect(sa)
|
| 43 |
+
TimeoutError: timed out
|
| 44 |
+
|
| 45 |
+
The above exception was the direct cause of the following exception:
|
| 46 |
+
|
| 47 |
+
Traceback (most recent call last):
|
| 48 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
|
| 49 |
+
response = self._make_request(
|
| 50 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
|
| 51 |
+
raise new_e
|
| 52 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
|
| 53 |
+
self._validate_conn(conn)
|
| 54 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
|
| 55 |
+
conn.connect()
|
| 56 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect
|
| 57 |
+
self.sock = sock = self._new_conn()
|
| 58 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn
|
| 59 |
+
raise ConnectTimeoutError(
|
| 60 |
+
urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)')
|
| 61 |
+
|
| 62 |
+
The above exception was the direct cause of the following exception:
|
| 63 |
+
|
| 64 |
+
Traceback (most recent call last):
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
|
| 66 |
+
resp = conn.urlopen(
|
| 67 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
|
| 68 |
+
retries = retries.increment(
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
|
| 70 |
+
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
|
| 71 |
+
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
|
| 72 |
+
|
| 73 |
+
During handling of the above exception, another exception occurred:
|
| 74 |
+
|
| 75 |
+
Traceback (most recent call last):
|
| 76 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__
|
| 77 |
+
result = self._call_fn(*args, **kwargs)
|
| 78 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute
|
| 79 |
+
return self.client.execute(*args, **kwargs) # type: ignore
|
| 80 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
|
| 81 |
+
result = self._get_result(document, *args, **kwargs)
|
| 82 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
|
| 83 |
+
return self.transport.execute(document, *args, **kwargs)
|
| 84 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute
|
| 85 |
+
request = self.session.post(self.url, **post_args)
|
| 86 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post
|
| 87 |
+
return self.request("POST", url, data=data, json=json, **kwargs)
|
| 88 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
|
| 89 |
+
resp = self.send(prep, **send_kwargs)
|
| 90 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
|
| 91 |
+
r = adapter.send(request, **kwargs)
|
| 92 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
|
| 93 |
+
raise ConnectTimeout(e, request=request)
|
| 94 |
+
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/run-ln8ma2mo.wandb
ADDED
|
Binary file (402 Bytes). View file
|
|
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/files/output.log
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-28T23:53:30.951932242+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-28T23:54:01.056320508+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 3 |
+
{"time":"2025-06-28T23:54:33.136125443+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 4 |
+
{"time":"2025-06-28T23:55:07.262654908+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 5 |
+
{"time":"2025-06-28T23:55:47.196366157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 6 |
+
{"time":"2025-06-28T23:56:33.830440313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 7 |
+
{"time":"2025-06-28T23:57:41.624410258+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 8 |
+
{"time":"2025-06-28T23:59:11.626386567+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 9 |
+
{"time":"2025-06-29T00:00:41.62885613+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 10 |
+
{"time":"2025-06-29T00:02:11.630480713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 11 |
+
{"time":"2025-06-29T00:03:41.632896773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 12 |
+
{"time":"2025-06-29T00:05:11.634787095+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 13 |
+
{"time":"2025-06-29T00:06:41.637469615+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 14 |
+
{"time":"2025-06-29T00:08:11.640346542+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 15 |
+
{"time":"2025-06-29T00:09:41.642435888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 16 |
+
{"time":"2025-06-29T00:11:11.643824358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 17 |
+
{"time":"2025-06-29T00:12:41.645847752+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 18 |
+
{"time":"2025-06-29T00:14:11.647375061+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 19 |
+
{"time":"2025-06-29T00:15:41.649750172+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 20 |
+
{"time":"2025-06-29T00:17:11.652403406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 21 |
+
{"time":"2025-06-29T00:18:41.655408766+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 22 |
+
{"time":"2025-06-29T00:20:11.657578168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 23 |
+
{"time":"2025-06-29T00:20:11.658626379+08:00","level":"ERROR","msg":"Failed to load features, feature will default to disabled","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 24 |
+
{"time":"2025-06-29T00:20:11.681430338+08:00","level":"INFO","msg":"created new stream","id":"mz3ej8ig"}
|
| 25 |
+
{"time":"2025-06-29T00:20:11.681462583+08:00","level":"INFO","msg":"stream: started","id":"mz3ej8ig"}
|
| 26 |
+
{"time":"2025-06-29T00:20:11.681507089+08:00","level":"INFO","msg":"sender: started","stream_id":"mz3ej8ig"}
|
| 27 |
+
{"time":"2025-06-29T00:20:11.68149584+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"mz3ej8ig"}
|
| 28 |
+
{"time":"2025-06-29T00:20:11.681547365+08:00","level":"INFO","msg":"handler: started","stream_id":"mz3ej8ig"}
|
| 29 |
+
{"time":"2025-06-29T00:20:11.682347217+08:00","level":"INFO","msg":"stream: closing","id":"mz3ej8ig"}
|
| 30 |
+
{"time":"2025-06-29T00:20:41.686911252+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T00:21:14.167970232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T00:21:48.35484514+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 33 |
+
{"time":"2025-06-29T00:22:27.366864931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 34 |
+
{"time":"2025-06-29T00:23:15.010485407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 35 |
+
{"time":"2025-06-29T00:24:21.797181116+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 36 |
+
{"time":"2025-06-29T00:25:51.799191068+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 37 |
+
{"time":"2025-06-29T00:27:21.801584003+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 38 |
+
{"time":"2025-06-29T00:28:51.804025943+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 39 |
+
{"time":"2025-06-29T00:30:11.686396042+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000441033,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
|
| 40 |
+
{"time":"2025-06-29T00:30:21.806550019+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 41 |
+
{"time":"2025-06-29T00:31:51.807720521+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 42 |
+
{"time":"2025-06-29T00:33:21.80993678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 43 |
+
{"time":"2025-06-29T00:34:51.811863783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 44 |
+
{"time":"2025-06-29T00:36:21.813546407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 45 |
+
{"time":"2025-06-29T00:37:51.815079734+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 46 |
+
{"time":"2025-06-29T00:39:21.816778971+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 47 |
+
{"time":"2025-06-29T00:40:11.688234649+08:00","level":"WARN","msg":"sender: taking a long time","seconds":1200.002273841,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
|
| 48 |
+
{"time":"2025-06-29T00:40:51.81937773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 49 |
+
{"time":"2025-06-29T00:42:21.820944763+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 50 |
+
{"time":"2025-06-29T00:43:51.823708778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 51 |
+
{"time":"2025-06-29T00:45:21.825965407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 52 |
+
{"time":"2025-06-29T00:46:51.827707527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 53 |
+
{"time":"2025-06-29T00:46:51.828793803+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 54 |
+
{"time":"2025-06-29T00:46:51.829193166+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
|
| 55 |
+
{"time":"2025-06-29T00:46:51.82925862+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1600.143330565,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
|
| 56 |
+
{"time":"2025-06-29T00:47:21.884399165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 57 |
+
{"time":"2025-06-29T00:47:54.221719208+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 58 |
+
{"time":"2025-06-29T00:48:28.514834678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 59 |
+
{"time":"2025-06-29T00:49:08.340678437+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 60 |
+
{"time":"2025-06-29T00:49:57.755779717+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 61 |
+
{"time":"2025-06-29T00:51:02.131711303+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 62 |
+
{"time":"2025-06-29T00:52:32.133820589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 63 |
+
{"time":"2025-06-29T00:54:02.13693015+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 64 |
+
{"time":"2025-06-29T00:55:32.139789811+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 65 |
+
{"time":"2025-06-29T00:57:02.14632236+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 66 |
+
{"time":"2025-06-29T00:58:32.148909001+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 67 |
+
{"time":"2025-06-29T01:00:02.154970223+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 68 |
+
{"time":"2025-06-29T01:01:32.157829588+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 69 |
+
{"time":"2025-06-29T01:03:02.159580228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 70 |
+
{"time":"2025-06-29T01:04:32.161980354+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 71 |
+
{"time":"2025-06-29T01:06:02.163980101+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 72 |
+
{"time":"2025-06-29T01:07:32.166420035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 73 |
+
{"time":"2025-06-29T01:09:02.168842314+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 74 |
+
{"time":"2025-06-29T01:10:32.171375938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 75 |
+
{"time":"2025-06-29T01:12:02.17356805+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 76 |
+
{"time":"2025-06-29T01:13:32.174403974+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 77 |
+
{"time":"2025-06-29T01:13:32.175440214+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 78 |
+
{"time":"2025-06-29T01:13:32.175867374+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 79 |
+
{"time":"2025-06-29T01:13:32.175926547+08:00","level":"INFO","msg":"handler: closed","stream_id":"mz3ej8ig"}
|
| 80 |
+
{"time":"2025-06-29T01:13:32.175943588+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"mz3ej8ig"}
|
| 81 |
+
{"time":"2025-06-29T01:13:32.176007588+08:00","level":"INFO","msg":"sender: closed","stream_id":"mz3ej8ig"}
|
| 82 |
+
{"time":"2025-06-29T01:13:32.180310341+08:00","level":"INFO","msg":"stream: closed","id":"mz3ej8ig"}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Configure stats pid to 907243
|
| 3 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log
|
| 7 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log
|
| 8 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-28 23:53:30,943 INFO MainThread:907243 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-28 23:53:30,943 INFO MainThread:907243 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-28 23:53:30,944 INFO MainThread:907243 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-28 23:53:30,946 INFO MainThread:907243 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-28 23:53:30,950 INFO MainThread:907243 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-28 23:53:30,953 INFO MainThread:907243 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-28 23:54:01,206 WARNING MainThread:907243 [wandb_init.py:init():1681] [no run ID] interrupted
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init
|
| 20 |
+
return wi.init(run_settings, run_config, run_printer)
|
| 21 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init
|
| 22 |
+
result = wait_with_progress(
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
|
| 24 |
+
return wait_all_with_progress(
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
|
| 26 |
+
return asyncio_compat.run(progress_loop_with_timeout)
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run
|
| 28 |
+
return future.result()
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result
|
| 30 |
+
self._condition.wait(timeout)
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait
|
| 32 |
+
waiter.acquire()
|
| 33 |
+
KeyboardInterrupt
|
| 34 |
+
2025-06-28 23:54:01,916 INFO MsgRouterThr:907243 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
| 35 |
+
2025-06-28 23:54:02,523 INFO Thread-3 (wrapped_target):907243 [retry.py:__call__():175] [no run ID] Retry attempt failed:
|
| 36 |
+
Traceback (most recent call last):
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn
|
| 38 |
+
sock = connection.create_connection(
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
|
| 40 |
+
raise err
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
|
| 42 |
+
sock.connect(sa)
|
| 43 |
+
TimeoutError: timed out
|
| 44 |
+
|
| 45 |
+
The above exception was the direct cause of the following exception:
|
| 46 |
+
|
| 47 |
+
Traceback (most recent call last):
|
| 48 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
|
| 49 |
+
response = self._make_request(
|
| 50 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
|
| 51 |
+
raise new_e
|
| 52 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
|
| 53 |
+
self._validate_conn(conn)
|
| 54 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
|
| 55 |
+
conn.connect()
|
| 56 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect
|
| 57 |
+
self.sock = sock = self._new_conn()
|
| 58 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn
|
| 59 |
+
raise ConnectTimeoutError(
|
| 60 |
+
urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)')
|
| 61 |
+
|
| 62 |
+
The above exception was the direct cause of the following exception:
|
| 63 |
+
|
| 64 |
+
Traceback (most recent call last):
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
|
| 66 |
+
resp = conn.urlopen(
|
| 67 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
|
| 68 |
+
retries = retries.increment(
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
|
| 70 |
+
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
|
| 71 |
+
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
|
| 72 |
+
|
| 73 |
+
During handling of the above exception, another exception occurred:
|
| 74 |
+
|
| 75 |
+
Traceback (most recent call last):
|
| 76 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__
|
| 77 |
+
result = self._call_fn(*args, **kwargs)
|
| 78 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute
|
| 79 |
+
return self.client.execute(*args, **kwargs) # type: ignore
|
| 80 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
|
| 81 |
+
result = self._get_result(document, *args, **kwargs)
|
| 82 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
|
| 83 |
+
return self.transport.execute(document, *args, **kwargs)
|
| 84 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute
|
| 85 |
+
request = self.session.post(self.url, **post_args)
|
| 86 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post
|
| 87 |
+
return self.request("POST", url, data=data, json=json, **kwargs)
|
| 88 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
|
| 89 |
+
resp = self.send(prep, **send_kwargs)
|
| 90 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
|
| 91 |
+
r = adapter.send(request, **kwargs)
|
| 92 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
|
| 93 |
+
raise ConnectTimeout(e, request=request)
|
| 94 |
+
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/run-mz3ej8ig.wandb
ADDED
|
Binary file (402 Bytes). View file
|
|
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/config.yaml
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 33
|
| 39 |
+
- 55
|
| 40 |
+
- 66
|
| 41 |
+
"4": 3.10.0
|
| 42 |
+
"5": 0.19.11
|
| 43 |
+
"6": 4.52.3
|
| 44 |
+
"8":
|
| 45 |
+
- 5
|
| 46 |
+
"12": 0.19.11
|
| 47 |
+
"13": linux-x86_64
|
| 48 |
+
accelerator:
|
| 49 |
+
value: gpu
|
| 50 |
+
batch_size:
|
| 51 |
+
value: 96
|
| 52 |
+
bert_hidden_dim:
|
| 53 |
+
value: 768
|
| 54 |
+
bert_name:
|
| 55 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 56 |
+
check_val_every_n_epoch:
|
| 57 |
+
value: 1
|
| 58 |
+
cross_attention_freq:
|
| 59 |
+
value: 2
|
| 60 |
+
devices:
|
| 61 |
+
value: 0,1,2,3,4,5,6,7
|
| 62 |
+
filename:
|
| 63 |
+
value: stage1_06282348_ddp
|
| 64 |
+
init_checkpoint:
|
| 65 |
+
value: ""
|
| 66 |
+
init_lr:
|
| 67 |
+
value: 0.0001
|
| 68 |
+
lm:
|
| 69 |
+
value: true
|
| 70 |
+
load_4bit:
|
| 71 |
+
value: false
|
| 72 |
+
lr_decay_rate:
|
| 73 |
+
value: 0.9
|
| 74 |
+
match_batch_size:
|
| 75 |
+
value: 64
|
| 76 |
+
max_epochs:
|
| 77 |
+
value: 20
|
| 78 |
+
min_lr:
|
| 79 |
+
value: 1e-05
|
| 80 |
+
mix_dataset:
|
| 81 |
+
value: true
|
| 82 |
+
mode:
|
| 83 |
+
value: train
|
| 84 |
+
num_query_token:
|
| 85 |
+
value: 8
|
| 86 |
+
num_workers:
|
| 87 |
+
value: 8
|
| 88 |
+
plm_name:
|
| 89 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 90 |
+
plm_tune:
|
| 91 |
+
value: freeze
|
| 92 |
+
pool_size:
|
| 93 |
+
value: 0
|
| 94 |
+
precision:
|
| 95 |
+
value: bf16-mixed
|
| 96 |
+
projection_dim:
|
| 97 |
+
value: 256
|
| 98 |
+
prot_aug:
|
| 99 |
+
value: None
|
| 100 |
+
prot_max_len:
|
| 101 |
+
value: 1024
|
| 102 |
+
ptm:
|
| 103 |
+
value: true
|
| 104 |
+
rerank_cand_num:
|
| 105 |
+
value: 128
|
| 106 |
+
retrieval_eval_epoch:
|
| 107 |
+
value: 10
|
| 108 |
+
root:
|
| 109 |
+
value: data
|
| 110 |
+
save_every_n_epochs:
|
| 111 |
+
value: 5
|
| 112 |
+
scheduler:
|
| 113 |
+
value: linear_warmup_cosine_lr
|
| 114 |
+
seed:
|
| 115 |
+
value: 42
|
| 116 |
+
strategy:
|
| 117 |
+
value: ddp
|
| 118 |
+
temperature:
|
| 119 |
+
value: 0.1
|
| 120 |
+
text_max_len:
|
| 121 |
+
value: 128
|
| 122 |
+
use_wandb_logger:
|
| 123 |
+
value: true
|
| 124 |
+
warmup_lr:
|
| 125 |
+
value: 1e-06
|
| 126 |
+
warmup_steps:
|
| 127 |
+
value: 1000
|
| 128 |
+
weight_decay:
|
| 129 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/output.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0629 00:01:34.010646 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 908727 via signal SIGTERM
|
| 2 |
+
W0629 00:01:34.011594 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 908857 via signal SIGTERM
|
| 3 |
+
W0629 00:01:34.011932 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909000 via signal SIGTERM
|
| 4 |
+
W0629 00:01:34.012163 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909139 via signal SIGTERM
|
| 5 |
+
W0629 00:01:34.012400 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909288 via signal SIGTERM
|
| 6 |
+
W0629 00:01:34.012614 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909427 via signal SIGTERM
|
| 7 |
+
W0629 00:01:34.012998 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909567 via signal SIGTERM
|
| 8 |
+
Traceback (most recent call last):
|
| 9 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 135, in <module>
|
| 10 |
+
main(args)
|
| 11 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 101, in main
|
| 12 |
+
trainer.fit(model, datamodule=dm)
|
| 13 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 14 |
+
call._call_and_handle_interrupt(
|
| 15 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 16 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 17 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 144, in launch
|
| 18 |
+
while not process_context.join():
|
| 19 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 215, in join
|
| 20 |
+
raise ProcessRaisedException(msg, error_index, failed_process.pid)
|
| 21 |
+
torch.multiprocessing.spawn.ProcessRaisedException: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`.
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stack-data==0.6.3
|
| 2 |
+
yarl==1.20.0
|
| 3 |
+
setuptools==78.1.1
|
| 4 |
+
cloudpathlib==0.21.1
|
| 5 |
+
pytz==2025.2
|
| 6 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 7 |
+
shellingham==1.5.4
|
| 8 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 9 |
+
Jinja2==3.1.6
|
| 10 |
+
pycocotools==2.0.8
|
| 11 |
+
pandas==2.2.3
|
| 12 |
+
scipy==1.15.3
|
| 13 |
+
tenacity==9.1.2
|
| 14 |
+
lightning-utilities==0.14.3
|
| 15 |
+
cfgv==3.4.0
|
| 16 |
+
hf-xet==1.1.2
|
| 17 |
+
platformdirs==4.3.8
|
| 18 |
+
smart-open==7.1.0
|
| 19 |
+
text-unidecode==1.3
|
| 20 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 21 |
+
omegaconf==2.3.0
|
| 22 |
+
referencing==0.36.2
|
| 23 |
+
mdurl==0.1.2
|
| 24 |
+
gitdb==4.0.12
|
| 25 |
+
identify==2.6.12
|
| 26 |
+
ipython==8.36.0
|
| 27 |
+
spacy-loggers==1.0.5
|
| 28 |
+
distlib==0.3.9
|
| 29 |
+
typing-inspection==0.4.1
|
| 30 |
+
antlr4-python3-runtime==4.9.3
|
| 31 |
+
multidict==6.4.4
|
| 32 |
+
nvidia-curand-cu12==10.3.5.147
|
| 33 |
+
prompt_toolkit==3.0.51
|
| 34 |
+
Pygments==2.19.1
|
| 35 |
+
numpy==2.2.6
|
| 36 |
+
decord==0.6.0
|
| 37 |
+
srsly==2.5.1
|
| 38 |
+
watchdog==6.0.0
|
| 39 |
+
pure_eval==0.2.3
|
| 40 |
+
virtualenv==20.31.2
|
| 41 |
+
altair==5.5.0
|
| 42 |
+
matplotlib-inline==0.1.7
|
| 43 |
+
bleach==6.2.0
|
| 44 |
+
exceptiongroup==1.3.0
|
| 45 |
+
fairscale==0.4.4
|
| 46 |
+
confection==0.1.5
|
| 47 |
+
fonttools==4.58.0
|
| 48 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 49 |
+
ptyprocess==0.7.0
|
| 50 |
+
pytorch-lightning==2.5.1.post0
|
| 51 |
+
nodeenv==1.9.1
|
| 52 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 53 |
+
requests==2.32.3
|
| 54 |
+
marisa-trie==1.2.1
|
| 55 |
+
cachetools==5.5.2
|
| 56 |
+
matplotlib==3.10.3
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
asttokens==3.0.0
|
| 59 |
+
torch==2.6.0
|
| 60 |
+
PyYAML==6.0.2
|
| 61 |
+
tifffile==2025.5.10
|
| 62 |
+
spacy==3.8.7
|
| 63 |
+
braceexpand==0.1.7
|
| 64 |
+
plotly==6.1.1
|
| 65 |
+
attrs==25.3.0
|
| 66 |
+
py-cpuinfo==9.0.0
|
| 67 |
+
frozenlist==1.6.0
|
| 68 |
+
catalogue==2.0.10
|
| 69 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 70 |
+
traitlets==5.14.3
|
| 71 |
+
annotated-types==0.7.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
thinc==8.3.6
|
| 74 |
+
imageio==2.37.0
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 76 |
+
certifi==2025.4.26
|
| 77 |
+
smmap==5.0.2
|
| 78 |
+
python-magic==0.4.27
|
| 79 |
+
triton==3.2.0
|
| 80 |
+
weasel==0.4.1
|
| 81 |
+
async-timeout==5.0.1
|
| 82 |
+
wcwidth==0.2.13
|
| 83 |
+
pillow==11.2.1
|
| 84 |
+
torchmetrics==1.7.1
|
| 85 |
+
kaggle==1.7.4.5
|
| 86 |
+
regex==2024.11.6
|
| 87 |
+
aiosignal==1.3.2
|
| 88 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 89 |
+
scikit-image==0.25.2
|
| 90 |
+
nvidia-nvtx-cu12==12.4.127
|
| 91 |
+
opendatasets==0.1.22
|
| 92 |
+
iopath==0.1.10
|
| 93 |
+
pyparsing==3.2.3
|
| 94 |
+
portalocker==3.1.1
|
| 95 |
+
executing==2.2.0
|
| 96 |
+
contexttimer==0.3.3
|
| 97 |
+
lazy_loader==0.4
|
| 98 |
+
wrapt==1.17.2
|
| 99 |
+
webdataset==0.2.111
|
| 100 |
+
blis==1.3.0
|
| 101 |
+
idna==3.10
|
| 102 |
+
timm==0.4.12
|
| 103 |
+
einops==0.8.1
|
| 104 |
+
packaging==24.2
|
| 105 |
+
decorator==5.2.1
|
| 106 |
+
filelock==3.18.0
|
| 107 |
+
python-slugify==8.0.4
|
| 108 |
+
cycler==0.12.1
|
| 109 |
+
charset-normalizer==3.4.2
|
| 110 |
+
pydantic==2.11.5
|
| 111 |
+
pydeck==0.9.1
|
| 112 |
+
tzdata==2025.2
|
| 113 |
+
jedi==0.19.2
|
| 114 |
+
aiohappyeyeballs==2.6.1
|
| 115 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 116 |
+
salesforce-lavis==1.0.2
|
| 117 |
+
parso==0.8.4
|
| 118 |
+
nvidia-nccl-cu12==2.21.5
|
| 119 |
+
toml==0.10.2
|
| 120 |
+
python-dateutil==2.9.0.post0
|
| 121 |
+
rich==14.0.0
|
| 122 |
+
tqdm==4.67.1
|
| 123 |
+
rpds-py==0.25.1
|
| 124 |
+
opencv-python-headless==4.5.5.64
|
| 125 |
+
tornado==6.5.1
|
| 126 |
+
propcache==0.3.1
|
| 127 |
+
webencodings==0.5.1
|
| 128 |
+
murmurhash==1.0.13
|
| 129 |
+
contourpy==1.3.2
|
| 130 |
+
joblib==1.5.1
|
| 131 |
+
networkx==3.4.2
|
| 132 |
+
six==1.17.0
|
| 133 |
+
markdown-it-py==3.0.0
|
| 134 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 135 |
+
msgpack==1.1.0
|
| 136 |
+
sentencepiece==0.2.0
|
| 137 |
+
cymem==2.0.11
|
| 138 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 139 |
+
absl-py==2.2.2
|
| 140 |
+
hjson==3.1.0
|
| 141 |
+
mpmath==1.3.0
|
| 142 |
+
pydantic_core==2.33.2
|
| 143 |
+
psutil==7.0.0
|
| 144 |
+
nvidia-ml-py==12.575.51
|
| 145 |
+
pyarrow==20.0.0
|
| 146 |
+
kiwisolver==1.4.8
|
| 147 |
+
sympy==1.13.1
|
| 148 |
+
ninja==1.11.1.4
|
| 149 |
+
rouge_score==0.1.2
|
| 150 |
+
deepspeed==0.16.10+b666844f
|
| 151 |
+
spacy-legacy==3.0.12
|
| 152 |
+
pycocoevalcap==1.2
|
| 153 |
+
pexpect==4.9.0
|
| 154 |
+
ftfy==6.3.1
|
| 155 |
+
protobuf==6.31.0
|
| 156 |
+
urllib3==2.4.0
|
| 157 |
+
wheel==0.45.1
|
| 158 |
+
nltk==3.9.1
|
| 159 |
+
streamlit==1.45.1
|
| 160 |
+
wasabi==1.1.3
|
| 161 |
+
pre_commit==4.2.0
|
| 162 |
+
safetensors==0.5.3
|
| 163 |
+
jsonschema-specifications==2025.4.1
|
| 164 |
+
langcodes==3.5.0
|
| 165 |
+
GitPython==3.1.44
|
| 166 |
+
blinker==1.9.0
|
| 167 |
+
torchvision==0.21.0
|
| 168 |
+
MarkupSafe==3.0.2
|
| 169 |
+
dill==0.3.8
|
| 170 |
+
yacs==0.1.8
|
| 171 |
+
pathlib==1.0.1
|
| 172 |
+
scikit-learn==1.6.1
|
| 173 |
+
cffi==1.17.1
|
| 174 |
+
pycparser==2.22
|
| 175 |
+
flash-attn==2.7.1.post1
|
| 176 |
+
cryptography==45.0.3
|
| 177 |
+
pycryptodome==3.23.0
|
| 178 |
+
cheroot==10.0.1
|
| 179 |
+
more-itertools==10.7.0
|
| 180 |
+
setproctitle==1.3.6
|
| 181 |
+
delta-center-client==0.0.4
|
| 182 |
+
jmespath==0.10.0
|
| 183 |
+
xxhash==3.5.0
|
| 184 |
+
pip==25.1.1
|
| 185 |
+
aliyun-python-sdk-core==2.16.0
|
| 186 |
+
jaraco.functools==4.1.0
|
| 187 |
+
bigmodelvis==0.0.1
|
| 188 |
+
aiohttp==3.12.2
|
| 189 |
+
multiprocess==0.70.16
|
| 190 |
+
opendelta==0.3.2
|
| 191 |
+
docker-pycreds==0.4.0
|
| 192 |
+
threadpoolctl==3.6.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
oss2==2.15.0
|
| 195 |
+
crcmod==1.7
|
| 196 |
+
transformers==4.52.3
|
| 197 |
+
datasets==3.6.0
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
opencv-python==4.11.0.86
|
| 200 |
+
wandb==0.19.11
|
| 201 |
+
fsspec==2025.3.0
|
| 202 |
+
tokenizers==0.21.1
|
| 203 |
+
sentry-sdk==2.29.1
|
| 204 |
+
preshed==3.0.10
|
| 205 |
+
aliyun-python-sdk-kms==2.16.5
|
| 206 |
+
huggingface-hub==0.32.1
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
narwhals==1.41.0
|
| 209 |
+
web.py==0.62
|
| 210 |
+
autocommand==2.2.2
|
| 211 |
+
importlib_metadata==8.0.0
|
| 212 |
+
zipp==3.19.2
|
| 213 |
+
jaraco.context==5.3.0
|
| 214 |
+
typeguard==4.3.0
|
| 215 |
+
jaraco.collections==5.1.0
|
| 216 |
+
typing_extensions==4.12.2
|
| 217 |
+
backports.tarfile==1.2.0
|
| 218 |
+
jaraco.functools==4.0.1
|
| 219 |
+
more-itertools==10.3.0
|
| 220 |
+
platformdirs==4.2.2
|
| 221 |
+
packaging==24.2
|
| 222 |
+
tomli==2.0.1
|
| 223 |
+
jaraco.text==3.12.1
|
| 224 |
+
wheel==0.45.1
|
| 225 |
+
inflect==7.3.1
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-28T15:54:48.951303Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06282348_ddp",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"96",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger",
|
| 30 |
+
"--strategy",
|
| 31 |
+
"ddp"
|
| 32 |
+
],
|
| 33 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 34 |
+
"codePath": "stage1.py",
|
| 35 |
+
"root": "./all_checkpoints/stage1_06282348_ddp/",
|
| 36 |
+
"host": "dsw-265304-57b7b77cbc-vwbwc",
|
| 37 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 38 |
+
"codePathLocal": "stage1.py",
|
| 39 |
+
"cpu_count": 64,
|
| 40 |
+
"cpu_count_logical": 64,
|
| 41 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 42 |
+
"gpu_count": 8,
|
| 43 |
+
"disk": {
|
| 44 |
+
"/": {
|
| 45 |
+
"total": "1623302262784",
|
| 46 |
+
"used": "1285226496"
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"memory": {
|
| 50 |
+
"total": "549755813888"
|
| 51 |
+
},
|
| 52 |
+
"cpu": {
|
| 53 |
+
"count": 64,
|
| 54 |
+
"countLogical": 64
|
| 55 |
+
},
|
| 56 |
+
"gpu_nvidia": [
|
| 57 |
+
{
|
| 58 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 59 |
+
"memoryTotal": "85198045184",
|
| 60 |
+
"architecture": "Ampere"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 64 |
+
"memoryTotal": "85198045184",
|
| 65 |
+
"architecture": "Ampere"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 69 |
+
"memoryTotal": "85198045184",
|
| 70 |
+
"architecture": "Ampere"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 74 |
+
"memoryTotal": "85198045184",
|
| 75 |
+
"architecture": "Ampere"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 79 |
+
"memoryTotal": "85198045184",
|
| 80 |
+
"architecture": "Ampere"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 84 |
+
"memoryTotal": "85198045184",
|
| 85 |
+
"architecture": "Ampere"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 89 |
+
"memoryTotal": "85198045184",
|
| 90 |
+
"architecture": "Ampere"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 94 |
+
"memoryTotal": "85198045184",
|
| 95 |
+
"architecture": "Ampere"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"cudaVersion": "12.1"
|
| 99 |
+
}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":408}}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-28T23:54:48.952903363+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-28T23:55:08.706410301+08:00","level":"INFO","msg":"created new stream","id":"d21a8n96"}
|
| 3 |
+
{"time":"2025-06-28T23:55:08.70719709+08:00","level":"INFO","msg":"stream: started","id":"d21a8n96"}
|
| 4 |
+
{"time":"2025-06-28T23:55:08.707235941+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"d21a8n96"}
|
| 5 |
+
{"time":"2025-06-28T23:55:08.707321639+08:00","level":"INFO","msg":"handler: started","stream_id":"d21a8n96"}
|
| 6 |
+
{"time":"2025-06-28T23:55:08.707259857+08:00","level":"INFO","msg":"sender: started","stream_id":"d21a8n96"}
|
| 7 |
+
{"time":"2025-06-28T23:55:15.734662691+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-28T23:58:30.738393146+08:00","level":"ERROR","msg":"filestream: json decode error: context deadline exceeded (Client.Timeout or context cancellation while reading body)"}
|
| 9 |
+
{"time":"2025-06-28T23:58:30.746811098+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
|
| 10 |
+
{"time":"2025-06-29T00:01:37.883490068+08:00","level":"INFO","msg":"stream: closing","id":"d21a8n96"}
|
| 11 |
+
{"time":"2025-06-29T00:01:37.883589533+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 12 |
+
{"time":"2025-06-29T00:01:37.884424806+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 13 |
+
{"time":"2025-06-29T00:01:42.08475624+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 14 |
+
{"time":"2025-06-29T00:02:28.146748335+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/d21a8n96/file_stream\": read tcp 10.1.6.17:45786->172.67.193.61:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:02:54.866501576+08:00","level":"INFO","msg":"handler: closed","stream_id":"d21a8n96"}
|
| 16 |
+
{"time":"2025-06-29T00:02:54.86656561+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"d21a8n96"}
|
| 17 |
+
{"time":"2025-06-29T00:02:54.86658917+08:00","level":"INFO","msg":"sender: closed","stream_id":"d21a8n96"}
|
| 18 |
+
{"time":"2025-06-29T00:02:54.871635053+08:00","level":"INFO","msg":"stream: closed","id":"d21a8n96"}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Configure stats pid to 908035
|
| 3 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log
|
| 7 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log
|
| 8 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-28 23:54:48,946 INFO MainThread:908035 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-28 23:54:48,948 INFO MainThread:908035 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-28 23:54:48,952 INFO MainThread:908035 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-28 23:54:48,955 INFO MainThread:908035 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-28 23:55:15,683 INFO MainThread:908035 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-28 23:55:15,875 INFO MainThread:908035 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-28 23:55:15,875 INFO MainThread:908035 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-28 23:55:15,879 INFO MainThread:908035 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-28 23:55:15,879 INFO MainThread:908035 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-28 23:55:15,880 INFO MainThread:908035 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 00:01:37,881 INFO MsgRouterThr:908035 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/run-d21a8n96.wandb
ADDED
|
Binary file (91.6 kB). View file
|
|
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/files/output.log
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:04:21.83840344+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:04:46.244498134+08:00","level":"INFO","msg":"created new stream","id":"y2lylvs5"}
|
| 3 |
+
{"time":"2025-06-29T00:04:46.245490243+08:00","level":"INFO","msg":"stream: started","id":"y2lylvs5"}
|
| 4 |
+
{"time":"2025-06-29T00:04:46.245519608+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"y2lylvs5"}
|
| 5 |
+
{"time":"2025-06-29T00:04:46.245575002+08:00","level":"INFO","msg":"handler: started","stream_id":"y2lylvs5"}
|
| 6 |
+
{"time":"2025-06-29T00:04:46.245579305+08:00","level":"INFO","msg":"sender: started","stream_id":"y2lylvs5"}
|
| 7 |
+
{"time":"2025-06-29T00:05:16.250479528+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: context deadline exceeded (Client.Timeout or context cancellation while reading body)"}
|
| 8 |
+
{"time":"2025-06-29T00:05:17.262869937+08:00","level":"INFO","msg":"stream: closing","id":"y2lylvs5"}
|
| 9 |
+
{"time":"2025-06-29T00:05:17.262985061+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
|
| 10 |
+
{"time":"2025-06-29T00:05:22.061017174+08:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.bandw.top/graphql"}
|
| 11 |
+
{"time":"2025-06-29T00:05:22.061118361+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404: {\"data\":{\"createRunFiles\":null},\"errors\":[{\"message\":\"project /stage1_06282348_ddp not found during createRunFiles\",\"path\":[\"createRunFiles\"]}]}"}
|
| 12 |
+
{"time":"2025-06-29T00:05:22.066030873+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 13 |
+
{"time":"2025-06-29T00:05:22.067356057+08:00","level":"INFO","msg":"handler: closed","stream_id":"y2lylvs5"}
|
| 14 |
+
{"time":"2025-06-29T00:05:22.067390632+08:00","level":"INFO","msg":"sender: closed","stream_id":"y2lylvs5"}
|
| 15 |
+
{"time":"2025-06-29T00:05:22.067390315+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y2lylvs5"}
|
| 16 |
+
{"time":"2025-06-29T00:05:22.070426682+08:00","level":"INFO","msg":"stream: closed","id":"y2lylvs5"}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Configure stats pid to 937487
|
| 3 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log
|
| 7 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:04:21,829 INFO MainThread:937487 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:04:21,832 INFO MainThread:937487 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:04:21,838 INFO MainThread:937487 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:04:21,842 INFO MainThread:937487 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/run-y2lylvs5.wandb
ADDED
|
Binary file (404 Bytes). View file
|
|
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/config.yaml
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": lr
|
| 6 |
+
"5": 2
|
| 7 |
+
"6":
|
| 8 |
+
- 1
|
| 9 |
+
- 3
|
| 10 |
+
"7": []
|
| 11 |
+
- "1": trainer/global_step
|
| 12 |
+
"6":
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": loader0/val_loss_ptm/dataloader_idx_0
|
| 16 |
+
"5": 2
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": loader2/val_loss/dataloader_idx_2
|
| 22 |
+
"5": 2
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": loader2/val_loss_lm/dataloader_idx_2
|
| 28 |
+
"5": 2
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": epoch
|
| 34 |
+
"5": 2
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": loader2/val_loss_ptc/dataloader_idx_2
|
| 40 |
+
"5": 2
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": loader0/val_loss_ptc/dataloader_idx_0
|
| 46 |
+
"5": 2
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": train_loss_lm
|
| 52 |
+
"5": 2
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": train_loss
|
| 58 |
+
"5": 2
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": loader1/val_loss_ptc/dataloader_idx_1
|
| 64 |
+
"5": 2
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": loader1/val_loss_ptm/dataloader_idx_1
|
| 70 |
+
"5": 2
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": loader0/val_loss/dataloader_idx_0
|
| 76 |
+
"5": 2
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
- "1": loader0/val_loss_lm/dataloader_idx_0
|
| 82 |
+
"5": 2
|
| 83 |
+
"6":
|
| 84 |
+
- 1
|
| 85 |
+
- 3
|
| 86 |
+
"7": []
|
| 87 |
+
- "1": train_loss_ptm
|
| 88 |
+
"5": 2
|
| 89 |
+
"6":
|
| 90 |
+
- 1
|
| 91 |
+
- 3
|
| 92 |
+
"7": []
|
| 93 |
+
- "1": train_loss_ptc
|
| 94 |
+
"5": 2
|
| 95 |
+
"6":
|
| 96 |
+
- 1
|
| 97 |
+
- 3
|
| 98 |
+
"7": []
|
| 99 |
+
- "1": loader1/val_loss_lm/dataloader_idx_1
|
| 100 |
+
"5": 2
|
| 101 |
+
"6":
|
| 102 |
+
- 1
|
| 103 |
+
- 3
|
| 104 |
+
"7": []
|
| 105 |
+
- "1": loader1/val_loss/dataloader_idx_1
|
| 106 |
+
"5": 2
|
| 107 |
+
"6":
|
| 108 |
+
- 1
|
| 109 |
+
- 3
|
| 110 |
+
"7": []
|
| 111 |
+
- "1": loader2/val_loss_ptm/dataloader_idx_2
|
| 112 |
+
"5": 2
|
| 113 |
+
"6":
|
| 114 |
+
- 1
|
| 115 |
+
- 3
|
| 116 |
+
"7": []
|
| 117 |
+
python_version: 3.10.0
|
| 118 |
+
t:
|
| 119 |
+
"1":
|
| 120 |
+
- 1
|
| 121 |
+
- 5
|
| 122 |
+
- 9
|
| 123 |
+
- 11
|
| 124 |
+
- 33
|
| 125 |
+
- 41
|
| 126 |
+
- 49
|
| 127 |
+
- 53
|
| 128 |
+
- 55
|
| 129 |
+
- 63
|
| 130 |
+
- 103
|
| 131 |
+
"2":
|
| 132 |
+
- 1
|
| 133 |
+
- 5
|
| 134 |
+
- 9
|
| 135 |
+
- 11
|
| 136 |
+
- 33
|
| 137 |
+
- 41
|
| 138 |
+
- 49
|
| 139 |
+
- 53
|
| 140 |
+
- 55
|
| 141 |
+
- 63
|
| 142 |
+
- 103
|
| 143 |
+
"3":
|
| 144 |
+
- 7
|
| 145 |
+
- 23
|
| 146 |
+
- 33
|
| 147 |
+
- 55
|
| 148 |
+
- 66
|
| 149 |
+
"4": 3.10.0
|
| 150 |
+
"5": 0.19.11
|
| 151 |
+
"6": 4.52.3
|
| 152 |
+
"8":
|
| 153 |
+
- 5
|
| 154 |
+
"12": 0.19.11
|
| 155 |
+
"13": linux-x86_64
|
| 156 |
+
accelerator:
|
| 157 |
+
value: gpu
|
| 158 |
+
batch_size:
|
| 159 |
+
value: 96
|
| 160 |
+
bert_hidden_dim:
|
| 161 |
+
value: 768
|
| 162 |
+
bert_name:
|
| 163 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 164 |
+
check_val_every_n_epoch:
|
| 165 |
+
value: 1
|
| 166 |
+
cross_attention_freq:
|
| 167 |
+
value: 2
|
| 168 |
+
devices:
|
| 169 |
+
value: 0,1,2,3,4,5,6,7
|
| 170 |
+
filename:
|
| 171 |
+
value: stage1_06282348_ddp
|
| 172 |
+
init_checkpoint:
|
| 173 |
+
value: ""
|
| 174 |
+
init_lr:
|
| 175 |
+
value: 0.0001
|
| 176 |
+
lm:
|
| 177 |
+
value: true
|
| 178 |
+
load_4bit:
|
| 179 |
+
value: false
|
| 180 |
+
lr_decay_rate:
|
| 181 |
+
value: 0.9
|
| 182 |
+
match_batch_size:
|
| 183 |
+
value: 64
|
| 184 |
+
max_epochs:
|
| 185 |
+
value: 20
|
| 186 |
+
min_lr:
|
| 187 |
+
value: 1e-05
|
| 188 |
+
mix_dataset:
|
| 189 |
+
value: true
|
| 190 |
+
mode:
|
| 191 |
+
value: train
|
| 192 |
+
num_query_token:
|
| 193 |
+
value: 8
|
| 194 |
+
num_workers:
|
| 195 |
+
value: 8
|
| 196 |
+
plm_name:
|
| 197 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 198 |
+
plm_tune:
|
| 199 |
+
value: freeze
|
| 200 |
+
pool_size:
|
| 201 |
+
value: 0
|
| 202 |
+
precision:
|
| 203 |
+
value: bf16-mixed
|
| 204 |
+
projection_dim:
|
| 205 |
+
value: 256
|
| 206 |
+
prot_aug:
|
| 207 |
+
value: None
|
| 208 |
+
prot_max_len:
|
| 209 |
+
value: 1024
|
| 210 |
+
ptm:
|
| 211 |
+
value: true
|
| 212 |
+
rerank_cand_num:
|
| 213 |
+
value: 128
|
| 214 |
+
retrieval_eval_epoch:
|
| 215 |
+
value: 10
|
| 216 |
+
root:
|
| 217 |
+
value: data
|
| 218 |
+
save_every_n_epochs:
|
| 219 |
+
value: 5
|
| 220 |
+
scheduler:
|
| 221 |
+
value: linear_warmup_cosine_lr
|
| 222 |
+
seed:
|
| 223 |
+
value: 42
|
| 224 |
+
strategy:
|
| 225 |
+
value: ddp
|
| 226 |
+
temperature:
|
| 227 |
+
value: 0.1
|
| 228 |
+
text_max_len:
|
| 229 |
+
value: 128
|
| 230 |
+
use_wandb_logger:
|
| 231 |
+
value: true
|
| 232 |
+
warmup_lr:
|
| 233 |
+
value: 1e-06
|
| 234 |
+
warmup_steps:
|
| 235 |
+
value: 1000
|
| 236 |
+
weight_decay:
|
| 237 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/output.log
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stack-data==0.6.3
|
| 2 |
+
yarl==1.20.0
|
| 3 |
+
setuptools==78.1.1
|
| 4 |
+
cloudpathlib==0.21.1
|
| 5 |
+
pytz==2025.2
|
| 6 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 7 |
+
shellingham==1.5.4
|
| 8 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 9 |
+
Jinja2==3.1.6
|
| 10 |
+
pycocotools==2.0.8
|
| 11 |
+
pandas==2.2.3
|
| 12 |
+
scipy==1.15.3
|
| 13 |
+
tenacity==9.1.2
|
| 14 |
+
lightning-utilities==0.14.3
|
| 15 |
+
cfgv==3.4.0
|
| 16 |
+
hf-xet==1.1.2
|
| 17 |
+
platformdirs==4.3.8
|
| 18 |
+
smart-open==7.1.0
|
| 19 |
+
text-unidecode==1.3
|
| 20 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 21 |
+
omegaconf==2.3.0
|
| 22 |
+
referencing==0.36.2
|
| 23 |
+
mdurl==0.1.2
|
| 24 |
+
gitdb==4.0.12
|
| 25 |
+
identify==2.6.12
|
| 26 |
+
ipython==8.36.0
|
| 27 |
+
spacy-loggers==1.0.5
|
| 28 |
+
distlib==0.3.9
|
| 29 |
+
typing-inspection==0.4.1
|
| 30 |
+
antlr4-python3-runtime==4.9.3
|
| 31 |
+
multidict==6.4.4
|
| 32 |
+
nvidia-curand-cu12==10.3.5.147
|
| 33 |
+
prompt_toolkit==3.0.51
|
| 34 |
+
Pygments==2.19.1
|
| 35 |
+
numpy==2.2.6
|
| 36 |
+
decord==0.6.0
|
| 37 |
+
srsly==2.5.1
|
| 38 |
+
watchdog==6.0.0
|
| 39 |
+
pure_eval==0.2.3
|
| 40 |
+
virtualenv==20.31.2
|
| 41 |
+
altair==5.5.0
|
| 42 |
+
matplotlib-inline==0.1.7
|
| 43 |
+
bleach==6.2.0
|
| 44 |
+
exceptiongroup==1.3.0
|
| 45 |
+
fairscale==0.4.4
|
| 46 |
+
confection==0.1.5
|
| 47 |
+
fonttools==4.58.0
|
| 48 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 49 |
+
ptyprocess==0.7.0
|
| 50 |
+
pytorch-lightning==2.5.1.post0
|
| 51 |
+
nodeenv==1.9.1
|
| 52 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 53 |
+
requests==2.32.3
|
| 54 |
+
marisa-trie==1.2.1
|
| 55 |
+
cachetools==5.5.2
|
| 56 |
+
matplotlib==3.10.3
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
asttokens==3.0.0
|
| 59 |
+
torch==2.6.0
|
| 60 |
+
PyYAML==6.0.2
|
| 61 |
+
tifffile==2025.5.10
|
| 62 |
+
spacy==3.8.7
|
| 63 |
+
braceexpand==0.1.7
|
| 64 |
+
plotly==6.1.1
|
| 65 |
+
attrs==25.3.0
|
| 66 |
+
py-cpuinfo==9.0.0
|
| 67 |
+
frozenlist==1.6.0
|
| 68 |
+
catalogue==2.0.10
|
| 69 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 70 |
+
traitlets==5.14.3
|
| 71 |
+
annotated-types==0.7.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
thinc==8.3.6
|
| 74 |
+
imageio==2.37.0
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 76 |
+
certifi==2025.4.26
|
| 77 |
+
smmap==5.0.2
|
| 78 |
+
python-magic==0.4.27
|
| 79 |
+
triton==3.2.0
|
| 80 |
+
weasel==0.4.1
|
| 81 |
+
async-timeout==5.0.1
|
| 82 |
+
wcwidth==0.2.13
|
| 83 |
+
pillow==11.2.1
|
| 84 |
+
torchmetrics==1.7.1
|
| 85 |
+
kaggle==1.7.4.5
|
| 86 |
+
regex==2024.11.6
|
| 87 |
+
aiosignal==1.3.2
|
| 88 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 89 |
+
scikit-image==0.25.2
|
| 90 |
+
nvidia-nvtx-cu12==12.4.127
|
| 91 |
+
opendatasets==0.1.22
|
| 92 |
+
iopath==0.1.10
|
| 93 |
+
pyparsing==3.2.3
|
| 94 |
+
portalocker==3.1.1
|
| 95 |
+
executing==2.2.0
|
| 96 |
+
contexttimer==0.3.3
|
| 97 |
+
lazy_loader==0.4
|
| 98 |
+
wrapt==1.17.2
|
| 99 |
+
webdataset==0.2.111
|
| 100 |
+
blis==1.3.0
|
| 101 |
+
idna==3.10
|
| 102 |
+
timm==0.4.12
|
| 103 |
+
einops==0.8.1
|
| 104 |
+
packaging==24.2
|
| 105 |
+
decorator==5.2.1
|
| 106 |
+
filelock==3.18.0
|
| 107 |
+
python-slugify==8.0.4
|
| 108 |
+
cycler==0.12.1
|
| 109 |
+
charset-normalizer==3.4.2
|
| 110 |
+
pydantic==2.11.5
|
| 111 |
+
pydeck==0.9.1
|
| 112 |
+
tzdata==2025.2
|
| 113 |
+
jedi==0.19.2
|
| 114 |
+
aiohappyeyeballs==2.6.1
|
| 115 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 116 |
+
salesforce-lavis==1.0.2
|
| 117 |
+
parso==0.8.4
|
| 118 |
+
nvidia-nccl-cu12==2.21.5
|
| 119 |
+
toml==0.10.2
|
| 120 |
+
python-dateutil==2.9.0.post0
|
| 121 |
+
rich==14.0.0
|
| 122 |
+
tqdm==4.67.1
|
| 123 |
+
rpds-py==0.25.1
|
| 124 |
+
opencv-python-headless==4.5.5.64
|
| 125 |
+
tornado==6.5.1
|
| 126 |
+
propcache==0.3.1
|
| 127 |
+
webencodings==0.5.1
|
| 128 |
+
murmurhash==1.0.13
|
| 129 |
+
contourpy==1.3.2
|
| 130 |
+
joblib==1.5.1
|
| 131 |
+
networkx==3.4.2
|
| 132 |
+
six==1.17.0
|
| 133 |
+
markdown-it-py==3.0.0
|
| 134 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 135 |
+
msgpack==1.1.0
|
| 136 |
+
sentencepiece==0.2.0
|
| 137 |
+
cymem==2.0.11
|
| 138 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 139 |
+
absl-py==2.2.2
|
| 140 |
+
hjson==3.1.0
|
| 141 |
+
mpmath==1.3.0
|
| 142 |
+
pydantic_core==2.33.2
|
| 143 |
+
psutil==7.0.0
|
| 144 |
+
nvidia-ml-py==12.575.51
|
| 145 |
+
pyarrow==20.0.0
|
| 146 |
+
kiwisolver==1.4.8
|
| 147 |
+
sympy==1.13.1
|
| 148 |
+
ninja==1.11.1.4
|
| 149 |
+
rouge_score==0.1.2
|
| 150 |
+
deepspeed==0.16.10+b666844f
|
| 151 |
+
spacy-legacy==3.0.12
|
| 152 |
+
pycocoevalcap==1.2
|
| 153 |
+
pexpect==4.9.0
|
| 154 |
+
ftfy==6.3.1
|
| 155 |
+
protobuf==6.31.0
|
| 156 |
+
urllib3==2.4.0
|
| 157 |
+
wheel==0.45.1
|
| 158 |
+
nltk==3.9.1
|
| 159 |
+
streamlit==1.45.1
|
| 160 |
+
wasabi==1.1.3
|
| 161 |
+
pre_commit==4.2.0
|
| 162 |
+
safetensors==0.5.3
|
| 163 |
+
jsonschema-specifications==2025.4.1
|
| 164 |
+
langcodes==3.5.0
|
| 165 |
+
GitPython==3.1.44
|
| 166 |
+
blinker==1.9.0
|
| 167 |
+
torchvision==0.21.0
|
| 168 |
+
MarkupSafe==3.0.2
|
| 169 |
+
dill==0.3.8
|
| 170 |
+
yacs==0.1.8
|
| 171 |
+
pathlib==1.0.1
|
| 172 |
+
scikit-learn==1.6.1
|
| 173 |
+
cffi==1.17.1
|
| 174 |
+
pycparser==2.22
|
| 175 |
+
flash-attn==2.7.1.post1
|
| 176 |
+
cryptography==45.0.3
|
| 177 |
+
pycryptodome==3.23.0
|
| 178 |
+
cheroot==10.0.1
|
| 179 |
+
more-itertools==10.7.0
|
| 180 |
+
setproctitle==1.3.6
|
| 181 |
+
delta-center-client==0.0.4
|
| 182 |
+
jmespath==0.10.0
|
| 183 |
+
xxhash==3.5.0
|
| 184 |
+
pip==25.1.1
|
| 185 |
+
aliyun-python-sdk-core==2.16.0
|
| 186 |
+
jaraco.functools==4.1.0
|
| 187 |
+
bigmodelvis==0.0.1
|
| 188 |
+
aiohttp==3.12.2
|
| 189 |
+
multiprocess==0.70.16
|
| 190 |
+
opendelta==0.3.2
|
| 191 |
+
docker-pycreds==0.4.0
|
| 192 |
+
threadpoolctl==3.6.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
oss2==2.15.0
|
| 195 |
+
crcmod==1.7
|
| 196 |
+
transformers==4.52.3
|
| 197 |
+
datasets==3.6.0
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
opencv-python==4.11.0.86
|
| 200 |
+
wandb==0.19.11
|
| 201 |
+
fsspec==2025.3.0
|
| 202 |
+
tokenizers==0.21.1
|
| 203 |
+
sentry-sdk==2.29.1
|
| 204 |
+
preshed==3.0.10
|
| 205 |
+
aliyun-python-sdk-kms==2.16.5
|
| 206 |
+
huggingface-hub==0.32.1
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
narwhals==1.41.0
|
| 209 |
+
web.py==0.62
|
| 210 |
+
autocommand==2.2.2
|
| 211 |
+
importlib_metadata==8.0.0
|
| 212 |
+
zipp==3.19.2
|
| 213 |
+
jaraco.context==5.3.0
|
| 214 |
+
typeguard==4.3.0
|
| 215 |
+
jaraco.collections==5.1.0
|
| 216 |
+
typing_extensions==4.12.2
|
| 217 |
+
backports.tarfile==1.2.0
|
| 218 |
+
jaraco.functools==4.0.1
|
| 219 |
+
more-itertools==10.3.0
|
| 220 |
+
platformdirs==4.2.2
|
| 221 |
+
packaging==24.2
|
| 222 |
+
tomli==2.0.1
|
| 223 |
+
jaraco.text==3.12.1
|
| 224 |
+
wheel==0.45.1
|
| 225 |
+
inflect==7.3.1
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-28T16:07:02.125247Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06282348_ddp",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"96",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger",
|
| 30 |
+
"--strategy",
|
| 31 |
+
"ddp"
|
| 32 |
+
],
|
| 33 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 34 |
+
"codePath": "stage1.py",
|
| 35 |
+
"root": "./all_checkpoints/stage1_06282348_ddp/",
|
| 36 |
+
"host": "dsw-265304-57b7b77cbc-vwbwc",
|
| 37 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 38 |
+
"codePathLocal": "stage1.py",
|
| 39 |
+
"cpu_count": 64,
|
| 40 |
+
"cpu_count_logical": 64,
|
| 41 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 42 |
+
"gpu_count": 8,
|
| 43 |
+
"disk": {
|
| 44 |
+
"/": {
|
| 45 |
+
"total": "1623302262784",
|
| 46 |
+
"used": "1285242880"
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"memory": {
|
| 50 |
+
"total": "549755813888"
|
| 51 |
+
},
|
| 52 |
+
"cpu": {
|
| 53 |
+
"count": 64,
|
| 54 |
+
"countLogical": 64
|
| 55 |
+
},
|
| 56 |
+
"gpu_nvidia": [
|
| 57 |
+
{
|
| 58 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 59 |
+
"memoryTotal": "85198045184",
|
| 60 |
+
"architecture": "Ampere"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 64 |
+
"memoryTotal": "85198045184",
|
| 65 |
+
"architecture": "Ampere"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 69 |
+
"memoryTotal": "85198045184",
|
| 70 |
+
"architecture": "Ampere"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 74 |
+
"memoryTotal": "85198045184",
|
| 75 |
+
"architecture": "Ampere"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 79 |
+
"memoryTotal": "85198045184",
|
| 80 |
+
"architecture": "Ampere"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 84 |
+
"memoryTotal": "85198045184",
|
| 85 |
+
"architecture": "Ampere"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 89 |
+
"memoryTotal": "85198045184",
|
| 90 |
+
"architecture": "Ampere"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 94 |
+
"memoryTotal": "85198045184",
|
| 95 |
+
"architecture": "Ampere"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"cudaVersion": "12.1"
|
| 99 |
+
}
|
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"loader1/val_loss_lm/dataloader_idx_1":6.895899772644043,"loader2/val_loss_lm/dataloader_idx_2":7.282508373260498,"loader0/val_loss_ptc/dataloader_idx_0":0.645770788192749,"loader0/val_loss_lm/dataloader_idx_0":2.0093350410461426,"loader2/val_loss_ptc/dataloader_idx_2":4.874267101287842,"_step":243,"lr":1.0554024811426643e-05,"loader0/val_loss/dataloader_idx_0":3.0574357509613037,"train_loss_lm":1.5327174663543701,"train_loss":2.090651035308838,"loader0/val_loss_ptm/dataloader_idx_0":0.4023294746875763,"_runtime":36514.777432942,"loader2/val_loss_ptm/dataloader_idx_2":2.5906364917755127,"trainer/global_step":11199,"loader1/val_loss/dataloader_idx_1":14.83725357055664,"loader1/val_loss_ptm/dataloader_idx_1":2.7793824672698975,"_wandb":{"runtime":36546},"loader2/val_loss/dataloader_idx_2":14.74741268157959,"train_loss_ptm":0.22319991886615753,"train_loss_ptc":0.33473363518714905,"_timestamp":1.751163336902292e+09,"loader1/val_loss_ptc/dataloader_idx_1":5.1619720458984375,"epoch":19}
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 3 |
+
{"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
|
| 4 |
+
{"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
|
| 5 |
+
{"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
|
| 6 |
+
{"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
|
| 7 |
+
{"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
|
| 8 |
+
{"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 9 |
+
{"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
|
| 10 |
+
{"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
|
| 17 |
+
{"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
|
| 23 |
+
{"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
|
| 24 |
+
{"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
|
| 25 |
+
{"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
|
| 28 |
+
{"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
|
| 29 |
+
{"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
|
| 30 |
+
{"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 34 |
+
{"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
|
| 35 |
+
{"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
|
| 36 |
+
{"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 37 |
+
{"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 38 |
+
{"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 39 |
+
{"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
|
| 40 |
+
{"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
|
| 41 |
+
{"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
|
| 42 |
+
{"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
|
| 3 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
|
| 7 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
- "1": train_loss
|
| 10 |
+
"5": 1
|
| 11 |
+
"6":
|
| 12 |
+
- 1
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": loader1/val_loss/dataloader_idx_1
|
| 16 |
+
"5": 1
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": loader0/val_loss_lm/dataloader_idx_0
|
| 22 |
+
"5": 1
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": loader1/val_loss_ptm/dataloader_idx_1
|
| 28 |
+
"5": 1
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": loader2/val_loss_lm/dataloader_idx_2
|
| 34 |
+
"5": 1
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": loader1/val_loss_ptc/dataloader_idx_1
|
| 40 |
+
"5": 1
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": epoch
|
| 46 |
+
"5": 1
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": lr
|
| 52 |
+
"5": 1
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": loader2/val_loss_ptc/dataloader_idx_2
|
| 58 |
+
"5": 1
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": loader0/val_loss_ptm/dataloader_idx_0
|
| 64 |
+
"5": 1
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": train_loss_ptc
|
| 70 |
+
"5": 1
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": train_loss_ptm
|
| 76 |
+
"5": 1
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
- "1": train_loss_lm
|
| 82 |
+
"5": 1
|
| 83 |
+
"6":
|
| 84 |
+
- 1
|
| 85 |
+
- 3
|
| 86 |
+
"7": []
|
| 87 |
+
- "1": loader2/val_loss_ptm/dataloader_idx_2
|
| 88 |
+
"5": 1
|
| 89 |
+
"6":
|
| 90 |
+
- 1
|
| 91 |
+
- 3
|
| 92 |
+
"7": []
|
| 93 |
+
- "1": loader0/val_loss/dataloader_idx_0
|
| 94 |
+
"5": 1
|
| 95 |
+
"6":
|
| 96 |
+
- 1
|
| 97 |
+
- 3
|
| 98 |
+
"7": []
|
| 99 |
+
- "1": loader2/val_loss/dataloader_idx_2
|
| 100 |
+
"5": 1
|
| 101 |
+
"6":
|
| 102 |
+
- 1
|
| 103 |
+
- 3
|
| 104 |
+
"7": []
|
| 105 |
+
- "1": loader1/val_loss_lm/dataloader_idx_1
|
| 106 |
+
"5": 1
|
| 107 |
+
"6":
|
| 108 |
+
- 1
|
| 109 |
+
- 3
|
| 110 |
+
"7": []
|
| 111 |
+
- "1": loader0/val_loss_ptc/dataloader_idx_0
|
| 112 |
+
"5": 1
|
| 113 |
+
"6":
|
| 114 |
+
- 1
|
| 115 |
+
- 3
|
| 116 |
+
"7": []
|
| 117 |
+
python_version: 3.10.0
|
| 118 |
+
t:
|
| 119 |
+
"1":
|
| 120 |
+
- 1
|
| 121 |
+
- 5
|
| 122 |
+
- 9
|
| 123 |
+
- 11
|
| 124 |
+
- 33
|
| 125 |
+
- 41
|
| 126 |
+
- 49
|
| 127 |
+
- 53
|
| 128 |
+
- 55
|
| 129 |
+
- 63
|
| 130 |
+
- 103
|
| 131 |
+
"2":
|
| 132 |
+
- 1
|
| 133 |
+
- 5
|
| 134 |
+
- 9
|
| 135 |
+
- 11
|
| 136 |
+
- 33
|
| 137 |
+
- 41
|
| 138 |
+
- 49
|
| 139 |
+
- 53
|
| 140 |
+
- 55
|
| 141 |
+
- 63
|
| 142 |
+
- 103
|
| 143 |
+
"3":
|
| 144 |
+
- 7
|
| 145 |
+
- 23
|
| 146 |
+
- 55
|
| 147 |
+
- 66
|
| 148 |
+
"4": 3.10.0
|
| 149 |
+
"5": 0.19.11
|
| 150 |
+
"6": 4.52.3
|
| 151 |
+
"8":
|
| 152 |
+
- 5
|
| 153 |
+
"12": 0.19.11
|
| 154 |
+
"13": linux-x86_64
|
| 155 |
+
accelerator:
|
| 156 |
+
value: gpu
|
| 157 |
+
batch_size:
|
| 158 |
+
value: 168
|
| 159 |
+
bert_hidden_dim:
|
| 160 |
+
value: 768
|
| 161 |
+
bert_name:
|
| 162 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 163 |
+
check_val_every_n_epoch:
|
| 164 |
+
value: 1
|
| 165 |
+
cross_attention_freq:
|
| 166 |
+
value: 2
|
| 167 |
+
devices:
|
| 168 |
+
value: 0,1,2,3,4,5,6,7
|
| 169 |
+
filename:
|
| 170 |
+
value: stage1_06290009_deepspeed
|
| 171 |
+
init_checkpoint:
|
| 172 |
+
value: ""
|
| 173 |
+
init_lr:
|
| 174 |
+
value: 0.0001
|
| 175 |
+
lm:
|
| 176 |
+
value: true
|
| 177 |
+
load_4bit:
|
| 178 |
+
value: false
|
| 179 |
+
lr_decay_rate:
|
| 180 |
+
value: 0.9
|
| 181 |
+
match_batch_size:
|
| 182 |
+
value: 64
|
| 183 |
+
max_epochs:
|
| 184 |
+
value: 20
|
| 185 |
+
min_lr:
|
| 186 |
+
value: 1e-05
|
| 187 |
+
mix_dataset:
|
| 188 |
+
value: true
|
| 189 |
+
mode:
|
| 190 |
+
value: train
|
| 191 |
+
num_query_token:
|
| 192 |
+
value: 8
|
| 193 |
+
num_workers:
|
| 194 |
+
value: 8
|
| 195 |
+
plm_name:
|
| 196 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 197 |
+
plm_tune:
|
| 198 |
+
value: freeze
|
| 199 |
+
pool_size:
|
| 200 |
+
value: 0
|
| 201 |
+
precision:
|
| 202 |
+
value: bf16-mixed
|
| 203 |
+
projection_dim:
|
| 204 |
+
value: 256
|
| 205 |
+
prot_aug:
|
| 206 |
+
value: None
|
| 207 |
+
prot_max_len:
|
| 208 |
+
value: 1024
|
| 209 |
+
ptm:
|
| 210 |
+
value: true
|
| 211 |
+
rerank_cand_num:
|
| 212 |
+
value: 128
|
| 213 |
+
retrieval_eval_epoch:
|
| 214 |
+
value: 10
|
| 215 |
+
root:
|
| 216 |
+
value: data
|
| 217 |
+
save_every_n_epochs:
|
| 218 |
+
value: 5
|
| 219 |
+
scheduler:
|
| 220 |
+
value: linear_warmup_cosine_lr
|
| 221 |
+
seed:
|
| 222 |
+
value: 42
|
| 223 |
+
strategy:
|
| 224 |
+
value: deepspeed
|
| 225 |
+
temperature:
|
| 226 |
+
value: 0.1
|
| 227 |
+
text_max_len:
|
| 228 |
+
value: 128
|
| 229 |
+
use_wandb_logger:
|
| 230 |
+
value: true
|
| 231 |
+
warmup_lr:
|
| 232 |
+
value: 1e-06
|
| 233 |
+
warmup_steps:
|
| 234 |
+
value: 1000
|
| 235 |
+
weight_decay:
|
| 236 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Epoch 19: 100%|███████████████████████████████████████████| 320/320 [17:07<00:00, 0.31it/s, v_num=xzqc]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
|
| 19 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
| 20 |
+
sd = self.module.state_dict(destination, prefix, keep_vars)
|
| 21 |
+
`Trainer.fit` stopped: `max_epochs=20` reached.
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
opendatasets==0.1.22
|
| 2 |
+
salesforce-lavis==1.0.2
|
| 3 |
+
Pygments==2.19.1
|
| 4 |
+
nvidia-nccl-cu12==2.21.5
|
| 5 |
+
tornado==6.5.1
|
| 6 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 7 |
+
requests==2.32.3
|
| 8 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
braceexpand==0.1.7
|
| 11 |
+
frozenlist==1.6.0
|
| 12 |
+
markdown-it-py==3.0.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
absl-py==2.2.2
|
| 15 |
+
pycocoevalcap==1.2
|
| 16 |
+
contexttimer==0.3.3
|
| 17 |
+
bleach==6.2.0
|
| 18 |
+
jsonschema-specifications==2025.4.1
|
| 19 |
+
pycocotools==2.0.8
|
| 20 |
+
python-slugify==8.0.4
|
| 21 |
+
tqdm==4.67.1
|
| 22 |
+
numpy==2.2.6
|
| 23 |
+
urllib3==2.4.0
|
| 24 |
+
deepspeed==0.16.10+b666844f
|
| 25 |
+
watchdog==6.0.0
|
| 26 |
+
wrapt==1.17.2
|
| 27 |
+
setuptools==78.1.1
|
| 28 |
+
matplotlib==3.10.3
|
| 29 |
+
pydeck==0.9.1
|
| 30 |
+
aiosignal==1.3.2
|
| 31 |
+
gitdb==4.0.12
|
| 32 |
+
hjson==3.1.0
|
| 33 |
+
timm==0.4.12
|
| 34 |
+
blis==1.3.0
|
| 35 |
+
PyYAML==6.0.2
|
| 36 |
+
referencing==0.36.2
|
| 37 |
+
contourpy==1.3.2
|
| 38 |
+
kaggle==1.7.4.5
|
| 39 |
+
triton==3.2.0
|
| 40 |
+
catalogue==2.0.10
|
| 41 |
+
idna==3.10
|
| 42 |
+
torch==2.6.0
|
| 43 |
+
text-unidecode==1.3
|
| 44 |
+
altair==5.5.0
|
| 45 |
+
cloudpathlib==0.21.1
|
| 46 |
+
protobuf==6.31.0
|
| 47 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 48 |
+
pytz==2025.2
|
| 49 |
+
sympy==1.13.1
|
| 50 |
+
spacy==3.8.7
|
| 51 |
+
MarkupSafe==3.0.2
|
| 52 |
+
thinc==8.3.6
|
| 53 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 54 |
+
wasabi==1.1.3
|
| 55 |
+
aiohappyeyeballs==2.6.1
|
| 56 |
+
nvidia-nvtx-cu12==12.4.127
|
| 57 |
+
rich==14.0.0
|
| 58 |
+
ipython==8.36.0
|
| 59 |
+
yarl==1.20.0
|
| 60 |
+
torchmetrics==1.7.1
|
| 61 |
+
multidict==6.4.4
|
| 62 |
+
cfgv==3.4.0
|
| 63 |
+
smmap==5.0.2
|
| 64 |
+
srsly==2.5.1
|
| 65 |
+
scikit-image==0.25.2
|
| 66 |
+
matplotlib-inline==0.1.7
|
| 67 |
+
annotated-types==0.7.0
|
| 68 |
+
lazy_loader==0.4
|
| 69 |
+
tenacity==9.1.2
|
| 70 |
+
GitPython==3.1.44
|
| 71 |
+
language_data==1.3.0
|
| 72 |
+
pydantic_core==2.33.2
|
| 73 |
+
sentencepiece==0.2.0
|
| 74 |
+
platformdirs==4.3.8
|
| 75 |
+
distlib==0.3.9
|
| 76 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 77 |
+
blinker==1.9.0
|
| 78 |
+
regex==2024.11.6
|
| 79 |
+
tifffile==2025.5.10
|
| 80 |
+
py-cpuinfo==9.0.0
|
| 81 |
+
attrs==25.3.0
|
| 82 |
+
mdurl==0.1.2
|
| 83 |
+
prompt_toolkit==3.0.51
|
| 84 |
+
packaging==24.2
|
| 85 |
+
async-timeout==5.0.1
|
| 86 |
+
six==1.17.0
|
| 87 |
+
executing==2.2.0
|
| 88 |
+
parso==0.8.4
|
| 89 |
+
omegaconf==2.3.0
|
| 90 |
+
wcwidth==0.2.13
|
| 91 |
+
murmurhash==1.0.13
|
| 92 |
+
stack-data==0.6.3
|
| 93 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 94 |
+
virtualenv==20.31.2
|
| 95 |
+
langcodes==3.5.0
|
| 96 |
+
fonttools==4.58.0
|
| 97 |
+
opencv-python-headless==4.5.5.64
|
| 98 |
+
jedi==0.19.2
|
| 99 |
+
torchvision==0.21.0
|
| 100 |
+
plotly==6.1.1
|
| 101 |
+
nodeenv==1.9.1
|
| 102 |
+
smart-open==7.1.0
|
| 103 |
+
toml==0.10.2
|
| 104 |
+
pytorch-lightning==2.5.1.post0
|
| 105 |
+
typing_extensions==4.13.2
|
| 106 |
+
safetensors==0.5.3
|
| 107 |
+
psutil==7.0.0
|
| 108 |
+
pillow==11.2.1
|
| 109 |
+
python-dateutil==2.9.0.post0
|
| 110 |
+
ftfy==6.3.1
|
| 111 |
+
scipy==1.15.3
|
| 112 |
+
webdataset==0.2.111
|
| 113 |
+
charset-normalizer==3.4.2
|
| 114 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 115 |
+
kiwisolver==1.4.8
|
| 116 |
+
nvidia-ml-py==12.575.51
|
| 117 |
+
confection==0.1.5
|
| 118 |
+
nvidia-curand-cu12==10.3.5.147
|
| 119 |
+
pandas==2.2.3
|
| 120 |
+
nltk==3.9.1
|
| 121 |
+
webencodings==0.5.1
|
| 122 |
+
pyarrow==20.0.0
|
| 123 |
+
asttokens==3.0.0
|
| 124 |
+
exceptiongroup==1.3.0
|
| 125 |
+
pre_commit==4.2.0
|
| 126 |
+
ninja==1.11.1.4
|
| 127 |
+
spacy-loggers==1.0.5
|
| 128 |
+
msgpack==1.1.0
|
| 129 |
+
lightning-utilities==0.14.3
|
| 130 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 131 |
+
tzdata==2025.2
|
| 132 |
+
cycler==0.12.1
|
| 133 |
+
hf-xet==1.1.2
|
| 134 |
+
antlr4-python3-runtime==4.9.3
|
| 135 |
+
iopath==0.1.10
|
| 136 |
+
pexpect==4.9.0
|
| 137 |
+
imageio==2.37.0
|
| 138 |
+
streamlit==1.45.1
|
| 139 |
+
python-magic==0.4.27
|
| 140 |
+
networkx==3.4.2
|
| 141 |
+
portalocker==3.1.1
|
| 142 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 143 |
+
propcache==0.3.1
|
| 144 |
+
ptyprocess==0.7.0
|
| 145 |
+
fairscale==0.4.4
|
| 146 |
+
rpds-py==0.25.1
|
| 147 |
+
certifi==2025.4.26
|
| 148 |
+
rouge_score==0.1.2
|
| 149 |
+
traitlets==5.14.3
|
| 150 |
+
identify==2.6.12
|
| 151 |
+
spacy-legacy==3.0.12
|
| 152 |
+
weasel==0.4.1
|
| 153 |
+
mpmath==1.3.0
|
| 154 |
+
cymem==2.0.11
|
| 155 |
+
typing-inspection==0.4.1
|
| 156 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 157 |
+
marisa-trie==1.2.1
|
| 158 |
+
einops==0.8.1
|
| 159 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 160 |
+
pydantic==2.11.5
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
joblib==1.5.1
|
| 163 |
+
Jinja2==3.1.6
|
| 164 |
+
filelock==3.18.0
|
| 165 |
+
pyparsing==3.2.3
|
| 166 |
+
pure_eval==0.2.3
|
| 167 |
+
decorator==5.2.1
|
| 168 |
+
wheel==0.45.1
|
| 169 |
+
pycryptodome==3.23.0
|
| 170 |
+
cheroot==10.0.1
|
| 171 |
+
multiprocess==0.70.16
|
| 172 |
+
aiohttp==3.12.2
|
| 173 |
+
crcmod==1.7
|
| 174 |
+
fsspec==2025.3.0
|
| 175 |
+
jmespath==0.10.0
|
| 176 |
+
preshed==3.0.10
|
| 177 |
+
jaraco.functools==4.1.0
|
| 178 |
+
cryptography==45.0.3
|
| 179 |
+
sentry-sdk==2.29.1
|
| 180 |
+
tokenizers==0.21.1
|
| 181 |
+
opendelta==0.3.2
|
| 182 |
+
pycparser==2.22
|
| 183 |
+
narwhals==1.41.0
|
| 184 |
+
scikit-learn==1.6.1
|
| 185 |
+
dill==0.3.8
|
| 186 |
+
oss2==2.15.0
|
| 187 |
+
yacs==0.1.8
|
| 188 |
+
more-itertools==10.7.0
|
| 189 |
+
pip==25.1.1
|
| 190 |
+
threadpoolctl==3.6.0
|
| 191 |
+
flash-attn==2.7.1.post1
|
| 192 |
+
bigmodelvis==0.0.1
|
| 193 |
+
pathlib==1.0.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
xxhash==3.5.0
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
aliyun-python-sdk-core==2.16.0
|
| 199 |
+
transformers==4.52.3
|
| 200 |
+
aliyun-python-sdk-kms==2.16.5
|
| 201 |
+
datasets==3.6.0
|
| 202 |
+
typer==0.16.0
|
| 203 |
+
docker-pycreds==0.4.0
|
| 204 |
+
click==8.2.1
|
| 205 |
+
huggingface-hub==0.32.1
|
| 206 |
+
web.py==0.62
|
| 207 |
+
cffi==1.17.1
|
| 208 |
+
opencv-python==4.11.0.86
|
| 209 |
+
jsonschema==4.24.0
|
| 210 |
+
typing_extensions==4.12.2
|
| 211 |
+
jaraco.functools==4.0.1
|
| 212 |
+
jaraco.text==3.12.1
|
| 213 |
+
jaraco.collections==5.1.0
|
| 214 |
+
inflect==7.3.1
|
| 215 |
+
more-itertools==10.3.0
|
| 216 |
+
packaging==24.2
|
| 217 |
+
importlib_metadata==8.0.0
|
| 218 |
+
backports.tarfile==1.2.0
|
| 219 |
+
typeguard==4.3.0
|
| 220 |
+
zipp==3.19.2
|
| 221 |
+
platformdirs==4.2.2
|
| 222 |
+
autocommand==2.2.2
|
| 223 |
+
jaraco.context==5.3.0
|
| 224 |
+
tomli==2.0.1
|
| 225 |
+
wheel==0.45.1
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-28T16:12:00.926076Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06290009_deepspeed",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"168",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"root": "./all_checkpoints/stage1_06290009_deepspeed/",
|
| 34 |
+
"host": "dsw-251511-c65bb988c-9g24f",
|
| 35 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 36 |
+
"codePathLocal": "stage1.py",
|
| 37 |
+
"cpu_count": 64,
|
| 38 |
+
"cpu_count_logical": 64,
|
| 39 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 40 |
+
"gpu_count": 8,
|
| 41 |
+
"disk": {
|
| 42 |
+
"/": {
|
| 43 |
+
"total": "1623302262784",
|
| 44 |
+
"used": "987680768"
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"memory": {
|
| 48 |
+
"total": "549755813888"
|
| 49 |
+
},
|
| 50 |
+
"cpu": {
|
| 51 |
+
"count": 64,
|
| 52 |
+
"countLogical": 64
|
| 53 |
+
},
|
| 54 |
+
"gpu_nvidia": [
|
| 55 |
+
{
|
| 56 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 57 |
+
"memoryTotal": "85198045184",
|
| 58 |
+
"architecture": "Ampere"
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 62 |
+
"memoryTotal": "85198045184",
|
| 63 |
+
"architecture": "Ampere"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 67 |
+
"memoryTotal": "85198045184",
|
| 68 |
+
"architecture": "Ampere"
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 72 |
+
"memoryTotal": "85198045184",
|
| 73 |
+
"architecture": "Ampere"
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 77 |
+
"memoryTotal": "85198045184",
|
| 78 |
+
"architecture": "Ampere"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 82 |
+
"memoryTotal": "85198045184",
|
| 83 |
+
"architecture": "Ampere"
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 87 |
+
"memoryTotal": "85198045184",
|
| 88 |
+
"architecture": "Ampere"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 92 |
+
"memoryTotal": "85198045184",
|
| 93 |
+
"architecture": "Ampere"
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"cudaVersion": "12.1"
|
| 97 |
+
}
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"loader1/val_loss_lm/dataloader_idx_1":5.460818767547607,"lr":1.0554024811426643e-05,"_timestamp":1.7511476307831557e+09,"loader1/val_loss/dataloader_idx_1":14.773687362670898,"train_loss":2.630859375,"loader2/val_loss_ptm/dataloader_idx_2":2.988206148147583,"loader1/val_loss_ptm/dataloader_idx_1":3.491912603378296,"loader2/val_loss_lm/dataloader_idx_2":6.094737529754639,"_runtime":20509.857471191,"loader2/val_loss_ptc/dataloader_idx_2":5.59119987487793,"_wandb":{"runtime":20517},"train_loss_ptc":0.64306640625,"train_loss_ptm":0.291015625,"loader0/val_loss_ptc/dataloader_idx_0":1.0085703134536743,"loader1/val_loss_ptc/dataloader_idx_1":5.825062274932861,"loader0/val_loss_lm/dataloader_idx_0":2.1440062522888184,"loader2/val_loss/dataloader_idx_2":14.674175262451172,"epoch":19,"train_loss_lm":1.697265625,"_step":147,"loader0/val_loss/dataloader_idx_0":3.6699562072753906,"loader0/val_loss_ptm/dataloader_idx_0":0.5172882676124573,"trainer/global_step":6399}
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 3 |
+
{"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
|
| 4 |
+
{"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
|
| 5 |
+
{"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
|
| 6 |
+
{"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
|
| 7 |
+
{"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
|
| 8 |
+
{"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 9 |
+
{"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
|
| 10 |
+
{"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
|
| 17 |
+
{"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
|
| 23 |
+
{"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
|
| 24 |
+
{"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
|
| 25 |
+
{"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
|
| 28 |
+
{"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
|
| 29 |
+
{"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
|
| 30 |
+
{"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 34 |
+
{"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
|
| 35 |
+
{"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
|
| 36 |
+
{"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 37 |
+
{"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 38 |
+
{"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 39 |
+
{"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
|
| 40 |
+
{"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
|
| 41 |
+
{"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
|
| 42 |
+
{"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
|
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
|
| 3 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
|
| 7 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-07-04T17:28:55.160594539+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-07-04T17:28:56.249328974+08:00","level":"INFO","msg":"created new stream","id":"6bkqzmou"}
|
| 3 |
+
{"time":"2025-07-04T17:28:56.249372351+08:00","level":"INFO","msg":"stream: started","id":"6bkqzmou"}
|
| 4 |
+
{"time":"2025-07-04T17:28:56.249400451+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6bkqzmou"}
|
| 5 |
+
{"time":"2025-07-04T17:28:56.249431272+08:00","level":"INFO","msg":"sender: started","stream_id":"6bkqzmou"}
|
| 6 |
+
{"time":"2025-07-04T17:28:56.249469216+08:00","level":"INFO","msg":"handler: started","stream_id":"6bkqzmou"}
|
| 7 |
+
{"time":"2025-07-04T17:28:57.491653525+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-07-04T22:06:52.200518707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 9 |
+
{"time":"2025-07-04T22:09:45.338273816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 10 |
+
{"time":"2025-07-04T22:19:20.574743081+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59078->172.67.193.61:443: read: connection timed out"}
|
| 11 |
+
{"time":"2025-07-04T22:25:54.288016702+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42132->172.67.193.61:443: read: connection timed out"}
|
| 12 |
+
{"time":"2025-07-04T22:29:40.591991523+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37544->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-07-04T22:36:54.256091094+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40968->172.67.193.61:443: read: connection timed out"}
|
| 14 |
+
{"time":"2025-07-04T22:37:22.364944108+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 15 |
+
{"time":"2025-07-04T22:40:24.499117928+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 16 |
+
{"time":"2025-07-04T22:40:51.249223858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 17 |
+
{"time":"2025-07-04T22:44:05.872015851+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59778->172.67.193.61:443: read: connection timed out"}
|
| 18 |
+
{"time":"2025-07-04T22:49:18.192032141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:47358->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-07-04T22:52:13.295997002+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46182->172.67.193.61:443: read: connection timed out"}
|
| 20 |
+
{"time":"2025-07-04T22:53:26.345699486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 21 |
+
{"time":"2025-07-04T22:55:37.691524069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 22 |
+
{"time":"2025-07-04T22:59:01.477384402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 23 |
+
{"time":"2025-07-04T23:01:22.224282887+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:45222->172.67.193.61:443: read: connection reset by peer"}
|
| 24 |
+
{"time":"2025-07-04T23:06:44.720013857+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46280->104.21.20.172:443: read: connection timed out"}
|
| 25 |
+
{"time":"2025-07-04T23:08:42.894770628+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 26 |
+
{"time":"2025-07-04T23:10:13.616061547+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57034->172.67.193.61:443: read: connection timed out"}
|
| 27 |
+
{"time":"2025-07-04T23:11:27.896127402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 28 |
+
{"time":"2025-07-04T23:15:47.19805854+08:00","level":"ERROR","msg":"filestream: json decode error: net/http: request canceled (Client.Timeout or context cancellation while reading body)"}
|
| 29 |
+
{"time":"2025-07-04T23:15:47.222866077+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
|
| 30 |
+
{"time":"2025-07-04T23:19:26.063989295+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:35504->172.67.193.61:443: read: connection timed out"}
|
| 31 |
+
{"time":"2025-07-04T23:21:57.905369451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 32 |
+
{"time":"2025-07-04T23:23:00.080992848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38378->172.67.193.61:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-07-04T23:26:54.577250259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:41928->104.21.20.172:443: read: connection reset by peer"}
|
| 34 |
+
{"time":"2025-07-04T23:28:47.703904029+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38040->172.67.193.61:443: read: connection reset by peer"}
|
| 35 |
+
{"time":"2025-07-04T23:30:12.910139882+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 36 |
+
{"time":"2025-07-04T23:30:45.313312591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 37 |
+
{"time":"2025-07-04T23:35:25.039973358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38402->104.21.20.172:443: read: connection timed out"}
|
| 38 |
+
{"time":"2025-07-04T23:39:27.49206097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39926->172.67.193.61:443: read: connection reset by peer"}
|
| 39 |
+
{"time":"2025-07-04T23:43:09.424012888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59448->172.67.193.61:443: read: connection timed out"}
|
| 40 |
+
{"time":"2025-07-04T23:46:07.600020006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33610->172.67.193.61:443: read: connection timed out"}
|
| 41 |
+
{"time":"2025-07-04T23:46:28.951111977+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 42 |
+
{"time":"2025-07-04T23:48:12.919414088+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 43 |
+
{"time":"2025-07-04T23:48:45.403207458+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 44 |
+
{"time":"2025-07-04T23:49:16.527984782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:49472->172.67.193.61:443: read: connection timed out"}
|
| 45 |
+
{"time":"2025-07-04T23:50:27.921623046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 46 |
+
{"time":"2025-07-04T23:52:57.899934024+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40636->172.67.193.61:443: read: connection reset by peer"}
|
| 47 |
+
{"time":"2025-07-04T23:56:18.928962652+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53056->172.67.193.61:443: read: connection timed out"}
|
| 48 |
+
{"time":"2025-07-04T23:56:57.924908638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 49 |
+
{"time":"2025-07-04T23:57:30.375318804+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 50 |
+
{"time":"2025-07-04T23:58:04.586629939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 51 |
+
{"time":"2025-07-05T00:03:09.552010125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39630->172.67.193.61:443: read: connection timed out"}
|
| 52 |
+
{"time":"2025-07-05T00:03:42.930344983+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 53 |
+
{"time":"2025-07-05T00:04:15.375941679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 54 |
+
{"time":"2025-07-05T00:04:21.655905995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:48150->172.67.193.61:443: read: connection reset by peer"}
|
| 55 |
+
{"time":"2025-07-05T00:04:50.22664016+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 56 |
+
{"time":"2025-07-05T00:08:01.391966638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53260->104.21.20.172:443: read: connection timed out"}
|
| 57 |
+
{"time":"2025-07-05T00:12:23.023992865+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:44698->172.67.193.61:443: read: connection timed out"}
|
| 58 |
+
{"time":"2025-07-05T00:15:42.93731147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 59 |
+
{"time":"2025-07-05T00:16:15.214992648+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 60 |
+
{"time":"2025-07-05T00:16:49.667525584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 61 |
+
{"time":"2025-07-05T00:17:01.040050871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42760->172.67.193.61:443: read: connection timed out"}
|
| 62 |
+
{"time":"2025-07-05T00:20:02.288062562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37934->104.21.20.172:443: read: connection timed out"}
|
| 63 |
+
{"time":"2025-07-05T00:23:13.264033499+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:51502->172.67.193.61:443: read: connection timed out"}
|
| 64 |
+
{"time":"2025-07-05T00:26:32.944001316+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:50460->172.67.193.61:443: read: connection timed out"}
|
| 65 |
+
{"time":"2025-07-05T00:30:17.199971228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:43756->104.21.20.172:443: read: connection timed out"}
|
| 66 |
+
{"time":"2025-07-05T00:35:42.94748626+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 67 |
+
{"time":"2025-07-05T00:35:44.881161178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59610->172.67.193.61:443: read: connection timed out"}
|
| 68 |
+
{"time":"2025-07-05T00:39:14.287974585+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46688->104.21.20.172:443: read: connection timed out"}
|
| 69 |
+
{"time":"2025-07-05T00:39:33.427697791+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40430->104.21.20.172:443: read: connection reset by peer"}
|
| 70 |
+
{"time":"2025-07-05T00:40:47.648388331+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:60164->172.67.193.61:443: read: connection reset by peer"}
|
| 71 |
+
{"time":"2025-07-05T00:42:34.088456552+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52978->104.21.20.172:443: read: connection reset by peer"}
|
| 72 |
+
{"time":"2025-07-05T00:47:13.006425282+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33354->172.67.193.61:443: read: connection reset by peer"}
|
| 73 |
+
{"time":"2025-07-05T00:49:55.823998082+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33330->172.67.193.61:443: read: connection timed out"}
|
| 74 |
+
{"time":"2025-07-05T00:52:51.439993456+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52416->172.67.193.61:443: read: connection timed out"}
|
| 75 |
+
{"time":"2025-07-05T00:57:07.440983899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39188->172.67.193.61:443: read: connection timed out"}
|
| 76 |
+
{"time":"2025-07-05T01:04:51.312039238+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59436->172.67.193.61:443: read: connection timed out"}
|
| 77 |
+
{"time":"2025-07-05T01:09:40.080000713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:34348->172.67.193.61:443: read: connection timed out"}
|
| 78 |
+
{"time":"2025-07-05T01:18:31.535996696+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57184->104.21.20.172:443: read: connection timed out"}
|
| 79 |
+
{"time":"2025-07-05T01:25:12.431983593+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59852->172.67.193.61:443: read: connection timed out"}
|
| 80 |
+
{"time":"2025-07-05T11:36:49.210079644+08:00","level":"INFO","msg":"stream: closing","id":"6bkqzmou"}
|
| 81 |
+
{"time":"2025-07-05T11:36:49.210163239+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 82 |
+
{"time":"2025-07-05T11:36:49.211103046+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 83 |
+
{"time":"2025-07-05T11:36:51.804545543+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 84 |
+
{"time":"2025-07-05T11:36:53.755788884+08:00","level":"INFO","msg":"handler: closed","stream_id":"6bkqzmou"}
|
| 85 |
+
{"time":"2025-07-05T11:36:53.755828602+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6bkqzmou"}
|
| 86 |
+
{"time":"2025-07-05T11:36:53.75584333+08:00","level":"INFO","msg":"sender: closed","stream_id":"6bkqzmou"}
|
| 87 |
+
{"time":"2025-07-05T11:36:53.759902053+08:00","level":"INFO","msg":"stream: closed","id":"6bkqzmou"}
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Configure stats pid to 29356
|
| 3 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log
|
| 7 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log
|
| 8 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-07-04 17:28:55,102 INFO MainThread:29356 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-07-04 17:28:55,102 INFO MainThread:29356 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-07-04 17:28:57,453 INFO MainThread:29356 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-07-04 17:28:57,668 INFO MainThread:29356 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-07-04 17:28:57,669 INFO MainThread:29356 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-07-04 17:28:57,678 INFO MainThread:29356 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-07-04 17:28:57,684 INFO MainThread:29356 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-07-04 17:28:57,686 INFO MainThread:29356 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-07-04 17:29:03,015 INFO MainThread:29356 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_07041727_2dataset', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 30, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-07-05 11:36:49,208 INFO MsgRouterThr:29356 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/config.yaml
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": loader1/val_loss_lm/dataloader_idx_1
|
| 6 |
+
"5": 2
|
| 7 |
+
"6":
|
| 8 |
+
- 1
|
| 9 |
+
- 3
|
| 10 |
+
"7": []
|
| 11 |
+
- "1": trainer/global_step
|
| 12 |
+
"6":
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": loader2/val_loss/dataloader_idx_2
|
| 16 |
+
"5": 2
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": lr
|
| 22 |
+
"5": 2
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": loader1/val_loss/dataloader_idx_1
|
| 28 |
+
"5": 2
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": loader2/val_loss_ptc/dataloader_idx_2
|
| 34 |
+
"5": 2
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": loader1/val_loss_ptm/dataloader_idx_1
|
| 40 |
+
"5": 2
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": loader2/val_loss_ptm/dataloader_idx_2
|
| 46 |
+
"5": 2
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": train_loss
|
| 52 |
+
"5": 2
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": loader0/val_loss_ptc/dataloader_idx_0
|
| 58 |
+
"5": 2
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": loader0/val_loss_ptm/dataloader_idx_0
|
| 64 |
+
"5": 2
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": loader1/val_loss_ptc/dataloader_idx_1
|
| 70 |
+
"5": 2
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": loader0/val_loss_lm/dataloader_idx_0
|
| 76 |
+
"5": 2
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
- "1": train_loss_ptc
|
| 82 |
+
"5": 2
|
| 83 |
+
"6":
|
| 84 |
+
- 1
|
| 85 |
+
- 3
|
| 86 |
+
"7": []
|
| 87 |
+
- "1": train_loss_ptm
|
| 88 |
+
"5": 2
|
| 89 |
+
"6":
|
| 90 |
+
- 1
|
| 91 |
+
- 3
|
| 92 |
+
"7": []
|
| 93 |
+
- "1": train_loss_lm
|
| 94 |
+
"5": 2
|
| 95 |
+
"6":
|
| 96 |
+
- 1
|
| 97 |
+
- 3
|
| 98 |
+
"7": []
|
| 99 |
+
- "1": epoch
|
| 100 |
+
"5": 2
|
| 101 |
+
"6":
|
| 102 |
+
- 1
|
| 103 |
+
- 3
|
| 104 |
+
"7": []
|
| 105 |
+
- "1": loader0/val_loss/dataloader_idx_0
|
| 106 |
+
"5": 2
|
| 107 |
+
"6":
|
| 108 |
+
- 1
|
| 109 |
+
- 3
|
| 110 |
+
"7": []
|
| 111 |
+
- "1": loader2/val_loss_lm/dataloader_idx_2
|
| 112 |
+
"5": 2
|
| 113 |
+
"6":
|
| 114 |
+
- 1
|
| 115 |
+
- 3
|
| 116 |
+
"7": []
|
| 117 |
+
python_version: 3.10.0
|
| 118 |
+
t:
|
| 119 |
+
"1":
|
| 120 |
+
- 1
|
| 121 |
+
- 5
|
| 122 |
+
- 9
|
| 123 |
+
- 11
|
| 124 |
+
- 33
|
| 125 |
+
- 41
|
| 126 |
+
- 49
|
| 127 |
+
- 53
|
| 128 |
+
- 55
|
| 129 |
+
- 63
|
| 130 |
+
- 103
|
| 131 |
+
"2":
|
| 132 |
+
- 1
|
| 133 |
+
- 5
|
| 134 |
+
- 9
|
| 135 |
+
- 11
|
| 136 |
+
- 33
|
| 137 |
+
- 41
|
| 138 |
+
- 49
|
| 139 |
+
- 53
|
| 140 |
+
- 55
|
| 141 |
+
- 63
|
| 142 |
+
- 103
|
| 143 |
+
"3":
|
| 144 |
+
- 7
|
| 145 |
+
- 23
|
| 146 |
+
- 55
|
| 147 |
+
- 66
|
| 148 |
+
"4": 3.10.0
|
| 149 |
+
"5": 0.19.11
|
| 150 |
+
"6": 4.52.3
|
| 151 |
+
"8":
|
| 152 |
+
- 5
|
| 153 |
+
"12": 0.19.11
|
| 154 |
+
"13": linux-x86_64
|
| 155 |
+
accelerator:
|
| 156 |
+
value: gpu
|
| 157 |
+
batch_size:
|
| 158 |
+
value: 32
|
| 159 |
+
bert_hidden_dim:
|
| 160 |
+
value: 768
|
| 161 |
+
bert_name:
|
| 162 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 163 |
+
check_val_every_n_epoch:
|
| 164 |
+
value: 1
|
| 165 |
+
cross_attention_freq:
|
| 166 |
+
value: 2
|
| 167 |
+
devices:
|
| 168 |
+
value: 0,1,2,3,4,5,6,7
|
| 169 |
+
filename:
|
| 170 |
+
value: stage1_07041727_2dataset
|
| 171 |
+
init_checkpoint:
|
| 172 |
+
value: ""
|
| 173 |
+
init_lr:
|
| 174 |
+
value: 0.0001
|
| 175 |
+
lm:
|
| 176 |
+
value: true
|
| 177 |
+
load_4bit:
|
| 178 |
+
value: false
|
| 179 |
+
lr_decay_rate:
|
| 180 |
+
value: 0.9
|
| 181 |
+
match_batch_size:
|
| 182 |
+
value: 64
|
| 183 |
+
max_epochs:
|
| 184 |
+
value: 30
|
| 185 |
+
min_lr:
|
| 186 |
+
value: 1e-05
|
| 187 |
+
mix_dataset:
|
| 188 |
+
value: true
|
| 189 |
+
mode:
|
| 190 |
+
value: train
|
| 191 |
+
num_query_token:
|
| 192 |
+
value: 8
|
| 193 |
+
num_workers:
|
| 194 |
+
value: 8
|
| 195 |
+
plm_name:
|
| 196 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 197 |
+
plm_tune:
|
| 198 |
+
value: freeze
|
| 199 |
+
pool_size:
|
| 200 |
+
value: 0
|
| 201 |
+
precision:
|
| 202 |
+
value: bf16-mixed
|
| 203 |
+
projection_dim:
|
| 204 |
+
value: 256
|
| 205 |
+
prot_aug:
|
| 206 |
+
value: None
|
| 207 |
+
prot_max_len:
|
| 208 |
+
value: 1024
|
| 209 |
+
ptm:
|
| 210 |
+
value: true
|
| 211 |
+
rerank_cand_num:
|
| 212 |
+
value: 128
|
| 213 |
+
retrieval_eval_epoch:
|
| 214 |
+
value: 10
|
| 215 |
+
root:
|
| 216 |
+
value: data
|
| 217 |
+
save_every_n_epochs:
|
| 218 |
+
value: 5
|
| 219 |
+
scheduler:
|
| 220 |
+
value: linear_warmup_cosine_lr
|
| 221 |
+
seed:
|
| 222 |
+
value: 42
|
| 223 |
+
strategy:
|
| 224 |
+
value: deepspeed
|
| 225 |
+
temperature:
|
| 226 |
+
value: 0.1
|
| 227 |
+
text_max_len:
|
| 228 |
+
value: 128
|
| 229 |
+
use_wandb_logger:
|
| 230 |
+
value: true
|
| 231 |
+
warmup_lr:
|
| 232 |
+
value: 1e-06
|
| 233 |
+
warmup_steps:
|
| 234 |
+
value: 1000
|
| 235 |
+
weight_decay:
|
| 236 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/output.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Epoch 29: 100%|█████████████████████████████████████████| 3331/3331 [36:16<00:00, 1.53it/s, v_num=zmou]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
|
| 19 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
| 20 |
+
sd = self.module.state_dict(destination, prefix, keep_vars)
|
| 21 |
+
`Trainer.fit` stopped: `max_epochs=30` reached.
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
attrs==25.3.0
|
| 2 |
+
tqdm==4.67.1
|
| 3 |
+
langcodes==3.5.0
|
| 4 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 5 |
+
tifffile==2025.5.10
|
| 6 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 7 |
+
nltk==3.9.1
|
| 8 |
+
salesforce-lavis==1.0.2
|
| 9 |
+
tzdata==2025.2
|
| 10 |
+
pyparsing==3.2.3
|
| 11 |
+
six==1.17.0
|
| 12 |
+
python-dateutil==2.9.0.post0
|
| 13 |
+
pandas==2.2.3
|
| 14 |
+
pytorch-lightning==2.5.1.post0
|
| 15 |
+
blinker==1.9.0
|
| 16 |
+
opencv-python-headless==4.5.5.64
|
| 17 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 18 |
+
pytz==2025.2
|
| 19 |
+
async-timeout==5.0.1
|
| 20 |
+
pillow==11.2.1
|
| 21 |
+
parso==0.8.4
|
| 22 |
+
joblib==1.5.1
|
| 23 |
+
contourpy==1.3.2
|
| 24 |
+
triton==3.2.0
|
| 25 |
+
marisa-trie==1.2.1
|
| 26 |
+
PyYAML==6.0.2
|
| 27 |
+
regex==2024.11.6
|
| 28 |
+
idna==3.10
|
| 29 |
+
nvidia-curand-cu12==10.3.5.147
|
| 30 |
+
rpds-py==0.25.1
|
| 31 |
+
aiosignal==1.3.2
|
| 32 |
+
srsly==2.5.1
|
| 33 |
+
confection==0.1.5
|
| 34 |
+
typing-inspection==0.4.1
|
| 35 |
+
packaging==24.2
|
| 36 |
+
distlib==0.3.9
|
| 37 |
+
networkx==3.4.2
|
| 38 |
+
absl-py==2.2.2
|
| 39 |
+
yarl==1.20.0
|
| 40 |
+
lightning-utilities==0.14.3
|
| 41 |
+
executing==2.2.0
|
| 42 |
+
pycocoevalcap==1.2
|
| 43 |
+
wheel==0.45.1
|
| 44 |
+
nvidia-ml-py==12.575.51
|
| 45 |
+
cycler==0.12.1
|
| 46 |
+
wrapt==1.17.2
|
| 47 |
+
jsonschema-specifications==2025.4.1
|
| 48 |
+
protobuf==6.31.0
|
| 49 |
+
mpmath==1.3.0
|
| 50 |
+
certifi==2025.4.26
|
| 51 |
+
py-cpuinfo==9.0.0
|
| 52 |
+
contexttimer==0.3.3
|
| 53 |
+
watchdog==6.0.0
|
| 54 |
+
pexpect==4.9.0
|
| 55 |
+
webencodings==0.5.1
|
| 56 |
+
hf-xet==1.1.2
|
| 57 |
+
cymem==2.0.11
|
| 58 |
+
requests==2.32.3
|
| 59 |
+
timm==0.4.12
|
| 60 |
+
omegaconf==2.3.0
|
| 61 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 62 |
+
webdataset==0.2.111
|
| 63 |
+
nodeenv==1.9.1
|
| 64 |
+
frozenlist==1.6.0
|
| 65 |
+
annotated-types==0.7.0
|
| 66 |
+
matplotlib-inline==0.1.7
|
| 67 |
+
urllib3==2.4.0
|
| 68 |
+
rich==14.0.0
|
| 69 |
+
GitPython==3.1.44
|
| 70 |
+
lazy_loader==0.4
|
| 71 |
+
msgpack==1.1.0
|
| 72 |
+
prompt_toolkit==3.0.51
|
| 73 |
+
fonttools==4.58.0
|
| 74 |
+
multidict==6.4.4
|
| 75 |
+
blis==1.3.0
|
| 76 |
+
thinc==8.3.6
|
| 77 |
+
nvidia-nvtx-cu12==12.4.127
|
| 78 |
+
torchmetrics==1.7.1
|
| 79 |
+
weasel==0.4.1
|
| 80 |
+
numpy==2.2.6
|
| 81 |
+
cachetools==5.5.2
|
| 82 |
+
Jinja2==3.1.6
|
| 83 |
+
matplotlib==3.10.3
|
| 84 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 85 |
+
Pygments==2.19.1
|
| 86 |
+
tornado==6.5.1
|
| 87 |
+
scipy==1.15.3
|
| 88 |
+
rouge_score==0.1.2
|
| 89 |
+
cloudpathlib==0.21.1
|
| 90 |
+
jedi==0.19.2
|
| 91 |
+
referencing==0.36.2
|
| 92 |
+
decord==0.6.0
|
| 93 |
+
setuptools==78.1.1
|
| 94 |
+
mdurl==0.1.2
|
| 95 |
+
identify==2.6.12
|
| 96 |
+
python-slugify==8.0.4
|
| 97 |
+
portalocker==3.1.1
|
| 98 |
+
catalogue==2.0.10
|
| 99 |
+
platformdirs==4.3.8
|
| 100 |
+
antlr4-python3-runtime==4.9.3
|
| 101 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 102 |
+
kaggle==1.7.4.5
|
| 103 |
+
pydeck==0.9.1
|
| 104 |
+
pydantic==2.11.5
|
| 105 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 106 |
+
pyarrow==20.0.0
|
| 107 |
+
nvidia-nccl-cu12==2.21.5
|
| 108 |
+
markdown-it-py==3.0.0
|
| 109 |
+
gitdb==4.0.12
|
| 110 |
+
altair==5.5.0
|
| 111 |
+
torchvision==0.21.0
|
| 112 |
+
python-magic==0.4.27
|
| 113 |
+
iopath==0.1.10
|
| 114 |
+
smart-open==7.1.0
|
| 115 |
+
torch==2.6.0
|
| 116 |
+
pycocotools==2.0.8
|
| 117 |
+
fairscale==0.4.4
|
| 118 |
+
traitlets==5.14.3
|
| 119 |
+
pure_eval==0.2.3
|
| 120 |
+
sympy==1.13.1
|
| 121 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 122 |
+
imageio==2.37.0
|
| 123 |
+
stack-data==0.6.3
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 126 |
+
einops==0.8.1
|
| 127 |
+
tenacity==9.1.2
|
| 128 |
+
virtualenv==20.31.2
|
| 129 |
+
ptyprocess==0.7.0
|
| 130 |
+
cfgv==3.4.0
|
| 131 |
+
pre_commit==4.2.0
|
| 132 |
+
language_data==1.3.0
|
| 133 |
+
typing_extensions==4.13.2
|
| 134 |
+
propcache==0.3.1
|
| 135 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 136 |
+
safetensors==0.5.3
|
| 137 |
+
text-unidecode==1.3
|
| 138 |
+
wcwidth==0.2.13
|
| 139 |
+
charset-normalizer==3.4.2
|
| 140 |
+
aiohappyeyeballs==2.6.1
|
| 141 |
+
ipython==8.36.0
|
| 142 |
+
streamlit==1.45.1
|
| 143 |
+
asttokens==3.0.0
|
| 144 |
+
psutil==7.0.0
|
| 145 |
+
smmap==5.0.2
|
| 146 |
+
exceptiongroup==1.3.0
|
| 147 |
+
murmurhash==1.0.13
|
| 148 |
+
filelock==3.18.0
|
| 149 |
+
plotly==6.1.1
|
| 150 |
+
hjson==3.1.0
|
| 151 |
+
pydantic_core==2.33.2
|
| 152 |
+
ninja==1.11.1.4
|
| 153 |
+
kiwisolver==1.4.8
|
| 154 |
+
spacy-legacy==3.0.12
|
| 155 |
+
opendatasets==0.1.22
|
| 156 |
+
decorator==5.2.1
|
| 157 |
+
spacy==3.8.7
|
| 158 |
+
wasabi==1.1.3
|
| 159 |
+
sentencepiece==0.2.0
|
| 160 |
+
toml==0.10.2
|
| 161 |
+
scikit-image==0.25.2
|
| 162 |
+
deepspeed==0.16.10+b666844f
|
| 163 |
+
ftfy==6.3.1
|
| 164 |
+
bleach==6.2.0
|
| 165 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 166 |
+
spacy-loggers==1.0.5
|
| 167 |
+
MarkupSafe==3.0.2
|
| 168 |
+
braceexpand==0.1.7
|
| 169 |
+
oss2==2.15.0
|
| 170 |
+
preshed==3.0.10
|
| 171 |
+
transformers==4.52.3
|
| 172 |
+
aiohttp==3.12.2
|
| 173 |
+
web.py==0.62
|
| 174 |
+
threadpoolctl==3.6.0
|
| 175 |
+
jaraco.functools==4.1.0
|
| 176 |
+
wandb==0.19.11
|
| 177 |
+
sentry-sdk==2.29.1
|
| 178 |
+
tokenizers==0.21.1
|
| 179 |
+
fsspec==2025.3.0
|
| 180 |
+
flash-attn==2.7.1.post1
|
| 181 |
+
opendelta==0.3.2
|
| 182 |
+
opencv-python==4.11.0.86
|
| 183 |
+
click==8.2.1
|
| 184 |
+
docker-pycreds==0.4.0
|
| 185 |
+
typer==0.16.0
|
| 186 |
+
xxhash==3.5.0
|
| 187 |
+
pathlib==1.0.1
|
| 188 |
+
dill==0.3.8
|
| 189 |
+
crcmod==1.7
|
| 190 |
+
bigmodelvis==0.0.1
|
| 191 |
+
datasets==3.6.0
|
| 192 |
+
pycryptodome==3.23.0
|
| 193 |
+
jsonschema==4.24.0
|
| 194 |
+
aliyun-python-sdk-core==2.16.0
|
| 195 |
+
jmespath==0.10.0
|
| 196 |
+
more-itertools==10.7.0
|
| 197 |
+
scikit-learn==1.6.1
|
| 198 |
+
huggingface-hub==0.32.1
|
| 199 |
+
cryptography==45.0.3
|
| 200 |
+
pycparser==2.22
|
| 201 |
+
yacs==0.1.8
|
| 202 |
+
aliyun-python-sdk-kms==2.16.5
|
| 203 |
+
cffi==1.17.1
|
| 204 |
+
delta-center-client==0.0.4
|
| 205 |
+
multiprocess==0.70.16
|
| 206 |
+
setproctitle==1.3.6
|
| 207 |
+
narwhals==1.41.0
|
| 208 |
+
pip==25.1.1
|
| 209 |
+
cheroot==10.0.1
|
| 210 |
+
jaraco.context==5.3.0
|
| 211 |
+
more-itertools==10.3.0
|
| 212 |
+
jaraco.functools==4.0.1
|
| 213 |
+
jaraco.text==3.12.1
|
| 214 |
+
platformdirs==4.2.2
|
| 215 |
+
packaging==24.2
|
| 216 |
+
wheel==0.45.1
|
| 217 |
+
zipp==3.19.2
|
| 218 |
+
inflect==7.3.1
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
typeguard==4.3.0
|
| 221 |
+
jaraco.collections==5.1.0
|
| 222 |
+
backports.tarfile==1.2.0
|
| 223 |
+
tomli==2.0.1
|
| 224 |
+
importlib_metadata==8.0.0
|
| 225 |
+
typing_extensions==4.12.2
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-07-04T09:28:55.102499Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_07041727_2dataset",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"30",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_07041727_2dataset/",
|
| 35 |
+
"host": "dsw-266702-5bd8569444-hrqd7",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1260302336"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"loader2/val_loss/dataloader_idx_2":13.163299560546875,"loader2/val_loss_ptm/dataloader_idx_2":2.123929738998413,"_step":2027,"loader1/val_loss_ptc/dataloader_idx_1":0.5523239970207214,"lr":1.0246513738820795e-05,"epoch":29,"_runtime":65267.979623679,"loader0/val_loss_ptm/dataloader_idx_0":0.3546168804168701,"train_loss_lm":1.42578125,"train_loss_ptc":0.218505859375,"loader0/val_loss/dataloader_idx_0":2.6084561347961426,"_timestamp":1.7516866030817497e+09,"loader2/val_loss_lm/dataloader_idx_2":7.963825225830078,"_wandb":{"runtime":65274},"loader1/val_loss/dataloader_idx_1":2.692500114440918,"loader1/val_loss_lm/dataloader_idx_1":1.6366312503814697,"loader2/val_loss_ptc/dataloader_idx_2":3.075716018676758,"train_loss_ptm":0.2022705078125,"loader0/val_loss_lm/dataloader_idx_0":1.9205952882766724,"loader1/val_loss_ptm/dataloader_idx_1":0.50240159034729,"loader0/val_loss_ptc/dataloader_idx_0":0.3330143988132477,"trainer/global_step":99929,"train_loss":1.845703125}
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-07-04T17:28:55.160594539+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-07-04T17:28:56.249328974+08:00","level":"INFO","msg":"created new stream","id":"6bkqzmou"}
|
| 3 |
+
{"time":"2025-07-04T17:28:56.249372351+08:00","level":"INFO","msg":"stream: started","id":"6bkqzmou"}
|
| 4 |
+
{"time":"2025-07-04T17:28:56.249400451+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6bkqzmou"}
|
| 5 |
+
{"time":"2025-07-04T17:28:56.249431272+08:00","level":"INFO","msg":"sender: started","stream_id":"6bkqzmou"}
|
| 6 |
+
{"time":"2025-07-04T17:28:56.249469216+08:00","level":"INFO","msg":"handler: started","stream_id":"6bkqzmou"}
|
| 7 |
+
{"time":"2025-07-04T17:28:57.491653525+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-07-04T22:06:52.200518707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 9 |
+
{"time":"2025-07-04T22:09:45.338273816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 10 |
+
{"time":"2025-07-04T22:19:20.574743081+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59078->172.67.193.61:443: read: connection timed out"}
|
| 11 |
+
{"time":"2025-07-04T22:25:54.288016702+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42132->172.67.193.61:443: read: connection timed out"}
|
| 12 |
+
{"time":"2025-07-04T22:29:40.591991523+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37544->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-07-04T22:36:54.256091094+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40968->172.67.193.61:443: read: connection timed out"}
|
| 14 |
+
{"time":"2025-07-04T22:37:22.364944108+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 15 |
+
{"time":"2025-07-04T22:40:24.499117928+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 16 |
+
{"time":"2025-07-04T22:40:51.249223858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 17 |
+
{"time":"2025-07-04T22:44:05.872015851+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59778->172.67.193.61:443: read: connection timed out"}
|
| 18 |
+
{"time":"2025-07-04T22:49:18.192032141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:47358->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-07-04T22:52:13.295997002+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46182->172.67.193.61:443: read: connection timed out"}
|
| 20 |
+
{"time":"2025-07-04T22:53:26.345699486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 21 |
+
{"time":"2025-07-04T22:55:37.691524069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 22 |
+
{"time":"2025-07-04T22:59:01.477384402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 23 |
+
{"time":"2025-07-04T23:01:22.224282887+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:45222->172.67.193.61:443: read: connection reset by peer"}
|
| 24 |
+
{"time":"2025-07-04T23:06:44.720013857+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46280->104.21.20.172:443: read: connection timed out"}
|
| 25 |
+
{"time":"2025-07-04T23:08:42.894770628+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 26 |
+
{"time":"2025-07-04T23:10:13.616061547+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57034->172.67.193.61:443: read: connection timed out"}
|
| 27 |
+
{"time":"2025-07-04T23:11:27.896127402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 28 |
+
{"time":"2025-07-04T23:15:47.19805854+08:00","level":"ERROR","msg":"filestream: json decode error: net/http: request canceled (Client.Timeout or context cancellation while reading body)"}
|
| 29 |
+
{"time":"2025-07-04T23:15:47.222866077+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
|
| 30 |
+
{"time":"2025-07-04T23:19:26.063989295+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:35504->172.67.193.61:443: read: connection timed out"}
|
| 31 |
+
{"time":"2025-07-04T23:21:57.905369451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 32 |
+
{"time":"2025-07-04T23:23:00.080992848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38378->172.67.193.61:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-07-04T23:26:54.577250259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:41928->104.21.20.172:443: read: connection reset by peer"}
|
| 34 |
+
{"time":"2025-07-04T23:28:47.703904029+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38040->172.67.193.61:443: read: connection reset by peer"}
|
| 35 |
+
{"time":"2025-07-04T23:30:12.910139882+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 36 |
+
{"time":"2025-07-04T23:30:45.313312591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 37 |
+
{"time":"2025-07-04T23:35:25.039973358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38402->104.21.20.172:443: read: connection timed out"}
|
| 38 |
+
{"time":"2025-07-04T23:39:27.49206097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39926->172.67.193.61:443: read: connection reset by peer"}
|
| 39 |
+
{"time":"2025-07-04T23:43:09.424012888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59448->172.67.193.61:443: read: connection timed out"}
|
| 40 |
+
{"time":"2025-07-04T23:46:07.600020006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33610->172.67.193.61:443: read: connection timed out"}
|
| 41 |
+
{"time":"2025-07-04T23:46:28.951111977+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
|
| 42 |
+
{"time":"2025-07-04T23:48:12.919414088+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 43 |
+
{"time":"2025-07-04T23:48:45.403207458+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 44 |
+
{"time":"2025-07-04T23:49:16.527984782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:49472->172.67.193.61:443: read: connection timed out"}
|
| 45 |
+
{"time":"2025-07-04T23:50:27.921623046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 46 |
+
{"time":"2025-07-04T23:52:57.899934024+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40636->172.67.193.61:443: read: connection reset by peer"}
|
| 47 |
+
{"time":"2025-07-04T23:56:18.928962652+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53056->172.67.193.61:443: read: connection timed out"}
|
| 48 |
+
{"time":"2025-07-04T23:56:57.924908638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 49 |
+
{"time":"2025-07-04T23:57:30.375318804+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 50 |
+
{"time":"2025-07-04T23:58:04.586629939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 51 |
+
{"time":"2025-07-05T00:03:09.552010125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39630->172.67.193.61:443: read: connection timed out"}
|
| 52 |
+
{"time":"2025-07-05T00:03:42.930344983+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 53 |
+
{"time":"2025-07-05T00:04:15.375941679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 54 |
+
{"time":"2025-07-05T00:04:21.655905995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:48150->172.67.193.61:443: read: connection reset by peer"}
|
| 55 |
+
{"time":"2025-07-05T00:04:50.22664016+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 56 |
+
{"time":"2025-07-05T00:08:01.391966638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53260->104.21.20.172:443: read: connection timed out"}
|
| 57 |
+
{"time":"2025-07-05T00:12:23.023992865+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:44698->172.67.193.61:443: read: connection timed out"}
|
| 58 |
+
{"time":"2025-07-05T00:15:42.93731147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 59 |
+
{"time":"2025-07-05T00:16:15.214992648+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 60 |
+
{"time":"2025-07-05T00:16:49.667525584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 61 |
+
{"time":"2025-07-05T00:17:01.040050871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42760->172.67.193.61:443: read: connection timed out"}
|
| 62 |
+
{"time":"2025-07-05T00:20:02.288062562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37934->104.21.20.172:443: read: connection timed out"}
|
| 63 |
+
{"time":"2025-07-05T00:23:13.264033499+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:51502->172.67.193.61:443: read: connection timed out"}
|
| 64 |
+
{"time":"2025-07-05T00:26:32.944001316+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:50460->172.67.193.61:443: read: connection timed out"}
|
| 65 |
+
{"time":"2025-07-05T00:30:17.199971228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:43756->104.21.20.172:443: read: connection timed out"}
|
| 66 |
+
{"time":"2025-07-05T00:35:42.94748626+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 67 |
+
{"time":"2025-07-05T00:35:44.881161178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59610->172.67.193.61:443: read: connection timed out"}
|
| 68 |
+
{"time":"2025-07-05T00:39:14.287974585+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46688->104.21.20.172:443: read: connection timed out"}
|
| 69 |
+
{"time":"2025-07-05T00:39:33.427697791+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40430->104.21.20.172:443: read: connection reset by peer"}
|
| 70 |
+
{"time":"2025-07-05T00:40:47.648388331+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:60164->172.67.193.61:443: read: connection reset by peer"}
|
| 71 |
+
{"time":"2025-07-05T00:42:34.088456552+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52978->104.21.20.172:443: read: connection reset by peer"}
|
| 72 |
+
{"time":"2025-07-05T00:47:13.006425282+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33354->172.67.193.61:443: read: connection reset by peer"}
|
| 73 |
+
{"time":"2025-07-05T00:49:55.823998082+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33330->172.67.193.61:443: read: connection timed out"}
|
| 74 |
+
{"time":"2025-07-05T00:52:51.439993456+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52416->172.67.193.61:443: read: connection timed out"}
|
| 75 |
+
{"time":"2025-07-05T00:57:07.440983899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39188->172.67.193.61:443: read: connection timed out"}
|
| 76 |
+
{"time":"2025-07-05T01:04:51.312039238+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59436->172.67.193.61:443: read: connection timed out"}
|
| 77 |
+
{"time":"2025-07-05T01:09:40.080000713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:34348->172.67.193.61:443: read: connection timed out"}
|
| 78 |
+
{"time":"2025-07-05T01:18:31.535996696+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57184->104.21.20.172:443: read: connection timed out"}
|
| 79 |
+
{"time":"2025-07-05T01:25:12.431983593+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59852->172.67.193.61:443: read: connection timed out"}
|
| 80 |
+
{"time":"2025-07-05T11:36:49.210079644+08:00","level":"INFO","msg":"stream: closing","id":"6bkqzmou"}
|
| 81 |
+
{"time":"2025-07-05T11:36:49.210163239+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 82 |
+
{"time":"2025-07-05T11:36:49.211103046+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 83 |
+
{"time":"2025-07-05T11:36:51.804545543+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 84 |
+
{"time":"2025-07-05T11:36:53.755788884+08:00","level":"INFO","msg":"handler: closed","stream_id":"6bkqzmou"}
|
| 85 |
+
{"time":"2025-07-05T11:36:53.755828602+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6bkqzmou"}
|
| 86 |
+
{"time":"2025-07-05T11:36:53.75584333+08:00","level":"INFO","msg":"sender: closed","stream_id":"6bkqzmou"}
|
| 87 |
+
{"time":"2025-07-05T11:36:53.759902053+08:00","level":"INFO","msg":"stream: closed","id":"6bkqzmou"}
|
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Configure stats pid to 29356
|
| 3 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log
|
| 7 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log
|
| 8 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-07-04 17:28:55,102 INFO MainThread:29356 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-07-04 17:28:55,102 INFO MainThread:29356 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-07-04 17:28:57,453 INFO MainThread:29356 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-07-04 17:28:57,668 INFO MainThread:29356 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-07-04 17:28:57,669 INFO MainThread:29356 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-07-04 17:28:57,678 INFO MainThread:29356 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-07-04 17:28:57,684 INFO MainThread:29356 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-07-04 17:28:57,686 INFO MainThread:29356 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-07-04 17:29:03,015 INFO MainThread:29356 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_07041727_2dataset', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 30, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-07-05 11:36:49,208 INFO MsgRouterThr:29356 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_ckpt/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-28T22:23:55.245053432+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-28T22:23:56.442748363+08:00","level":"INFO","msg":"created new stream","id":"e9wtzwz1"}
|
| 3 |
+
{"time":"2025-06-28T22:23:56.442786823+08:00","level":"INFO","msg":"stream: started","id":"e9wtzwz1"}
|
| 4 |
+
{"time":"2025-06-28T22:23:56.44283909+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"e9wtzwz1"}
|
| 5 |
+
{"time":"2025-06-28T22:23:56.44287811+08:00","level":"INFO","msg":"sender: started","stream_id":"e9wtzwz1"}
|
| 6 |
+
{"time":"2025-06-28T22:23:56.442850569+08:00","level":"INFO","msg":"handler: started","stream_id":"e9wtzwz1"}
|
| 7 |
+
{"time":"2025-06-28T22:23:57.657067842+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-28T23:13:08.786733475+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:41088->172.67.193.61:443: read: connection timed out"}
|
| 9 |
+
{"time":"2025-06-28T23:16:23.858735046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:58168->104.21.20.172:443: read: connection timed out"}
|
| 10 |
+
{"time":"2025-06-28T23:20:12.333412842+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:50886->172.67.193.61:443: read: connection reset by peer"}
|
| 11 |
+
{"time":"2025-06-28T23:28:29.895934993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-06-28T23:32:39.731699923+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:54496->104.21.20.172:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-06-28T23:35:17.938724051+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:33406->172.67.193.61:443: read: connection timed out"}
|
| 14 |
+
{"time":"2025-06-28T23:38:54.515701632+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:59930->172.67.193.61:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-28T23:41:28.010949965+08:00","level":"INFO","msg":"stream: closing","id":"e9wtzwz1"}
|
| 16 |
+
{"time":"2025-06-28T23:41:28.011132748+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 17 |
+
{"time":"2025-06-28T23:41:28.066664522+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 18 |
+
{"time":"2025-06-28T23:41:37.996261564+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 19 |
+
{"time":"2025-06-28T23:42:36.21077519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:36806->172.67.193.61:443: read: connection timed out"}
|
ProtT3/all_checkpoints/stage1_ckpt/wandb/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Configure stats pid to 3589
|
| 3 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log
|
| 7 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log
|
| 8 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-28 22:23:55,237 INFO MainThread:3589 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-28 22:23:55,239 INFO MainThread:3589 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-28 22:23:55,240 INFO MainThread:3589 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-28 22:23:55,249 INFO MainThread:3589 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-28 22:23:57,648 INFO MainThread:3589 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-28 22:23:57,829 INFO MainThread:3589 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-28 23:41:27,993 INFO MsgRouterThr:3589 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/config.yaml
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 33
|
| 39 |
+
- 55
|
| 40 |
+
- 66
|
| 41 |
+
"4": 3.10.0
|
| 42 |
+
"5": 0.19.11
|
| 43 |
+
"6": 4.52.3
|
| 44 |
+
"8":
|
| 45 |
+
- 5
|
| 46 |
+
"12": 0.19.11
|
| 47 |
+
"13": linux-x86_64
|
| 48 |
+
accelerator:
|
| 49 |
+
value: gpu
|
| 50 |
+
batch_size:
|
| 51 |
+
value: 32
|
| 52 |
+
bert_hidden_dim:
|
| 53 |
+
value: 768
|
| 54 |
+
bert_name:
|
| 55 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 56 |
+
check_val_every_n_epoch:
|
| 57 |
+
value: 1
|
| 58 |
+
cross_attention_freq:
|
| 59 |
+
value: 2
|
| 60 |
+
devices:
|
| 61 |
+
value: 0,1,2,3,4,5,6,7
|
| 62 |
+
filename:
|
| 63 |
+
value: stage1_ckpt
|
| 64 |
+
init_checkpoint:
|
| 65 |
+
value: ""
|
| 66 |
+
init_lr:
|
| 67 |
+
value: 0.0001
|
| 68 |
+
lm:
|
| 69 |
+
value: true
|
| 70 |
+
load_4bit:
|
| 71 |
+
value: false
|
| 72 |
+
lr_decay_rate:
|
| 73 |
+
value: 0.9
|
| 74 |
+
match_batch_size:
|
| 75 |
+
value: 64
|
| 76 |
+
max_epochs:
|
| 77 |
+
value: 20
|
| 78 |
+
min_lr:
|
| 79 |
+
value: 1e-05
|
| 80 |
+
mix_dataset:
|
| 81 |
+
value: true
|
| 82 |
+
mode:
|
| 83 |
+
value: train
|
| 84 |
+
num_query_token:
|
| 85 |
+
value: 8
|
| 86 |
+
num_workers:
|
| 87 |
+
value: 8
|
| 88 |
+
plm_name:
|
| 89 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 90 |
+
plm_tune:
|
| 91 |
+
value: freeze
|
| 92 |
+
pool_size:
|
| 93 |
+
value: 0
|
| 94 |
+
precision:
|
| 95 |
+
value: bf16-mixed
|
| 96 |
+
projection_dim:
|
| 97 |
+
value: 256
|
| 98 |
+
prot_aug:
|
| 99 |
+
value: None
|
| 100 |
+
prot_max_len:
|
| 101 |
+
value: 1024
|
| 102 |
+
ptm:
|
| 103 |
+
value: true
|
| 104 |
+
rerank_cand_num:
|
| 105 |
+
value: 128
|
| 106 |
+
retrieval_eval_epoch:
|
| 107 |
+
value: 10
|
| 108 |
+
root:
|
| 109 |
+
value: data_small
|
| 110 |
+
save_every_n_epochs:
|
| 111 |
+
value: 5
|
| 112 |
+
scheduler:
|
| 113 |
+
value: linear_warmup_cosine_lr
|
| 114 |
+
seed:
|
| 115 |
+
value: 42
|
| 116 |
+
strategy:
|
| 117 |
+
value: ddp
|
| 118 |
+
temperature:
|
| 119 |
+
value: 0.1
|
| 120 |
+
text_max_len:
|
| 121 |
+
value: 128
|
| 122 |
+
use_wandb_logger:
|
| 123 |
+
value: true
|
| 124 |
+
warmup_lr:
|
| 125 |
+
value: 1e-06
|
| 126 |
+
warmup_steps:
|
| 127 |
+
value: 1000
|
| 128 |
+
weight_decay:
|
| 129 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/output.log
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
Detected KeyboardInterrupt, attempting graceful shutdown ...
|
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
marisa-trie==1.2.1
|
| 2 |
+
pydantic==2.11.5
|
| 3 |
+
mdurl==0.1.2
|
| 4 |
+
gitdb==4.0.12
|
| 5 |
+
scikit-image==0.25.2
|
| 6 |
+
async-timeout==5.0.1
|
| 7 |
+
blis==1.3.0
|
| 8 |
+
urllib3==2.4.0
|
| 9 |
+
spacy==3.8.7
|
| 10 |
+
nvidia-ml-py==12.575.51
|
| 11 |
+
braceexpand==0.1.7
|
| 12 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 13 |
+
rich==14.0.0
|
| 14 |
+
setuptools==78.1.1
|
| 15 |
+
matplotlib==3.10.3
|
| 16 |
+
catalogue==2.0.10
|
| 17 |
+
decord==0.6.0
|
| 18 |
+
numpy==2.2.6
|
| 19 |
+
charset-normalizer==3.4.2
|
| 20 |
+
langcodes==3.5.0
|
| 21 |
+
pexpect==4.9.0
|
| 22 |
+
nltk==3.9.1
|
| 23 |
+
cachetools==5.5.2
|
| 24 |
+
cfgv==3.4.0
|
| 25 |
+
prompt_toolkit==3.0.51
|
| 26 |
+
srsly==2.5.1
|
| 27 |
+
einops==0.8.1
|
| 28 |
+
Jinja2==3.1.6
|
| 29 |
+
cloudpathlib==0.21.1
|
| 30 |
+
streamlit==1.45.1
|
| 31 |
+
pydantic_core==2.33.2
|
| 32 |
+
tornado==6.5.1
|
| 33 |
+
nvidia-curand-cu12==10.3.5.147
|
| 34 |
+
deepspeed==0.16.10+b666844f
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
omegaconf==2.3.0
|
| 37 |
+
msgpack==1.1.0
|
| 38 |
+
pandas==2.2.3
|
| 39 |
+
rouge_score==0.1.2
|
| 40 |
+
six==1.17.0
|
| 41 |
+
language_data==1.3.0
|
| 42 |
+
referencing==0.36.2
|
| 43 |
+
rpds-py==0.25.1
|
| 44 |
+
lazy_loader==0.4
|
| 45 |
+
pydeck==0.9.1
|
| 46 |
+
markdown-it-py==3.0.0
|
| 47 |
+
fonttools==4.58.0
|
| 48 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 49 |
+
smart-open==7.1.0
|
| 50 |
+
identify==2.6.12
|
| 51 |
+
pure_eval==0.2.3
|
| 52 |
+
confection==0.1.5
|
| 53 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 54 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 55 |
+
decorator==5.2.1
|
| 56 |
+
nvidia-nccl-cu12==2.21.5
|
| 57 |
+
pytz==2025.2
|
| 58 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 59 |
+
plotly==6.1.1
|
| 60 |
+
safetensors==0.5.3
|
| 61 |
+
portalocker==3.1.1
|
| 62 |
+
toml==0.10.2
|
| 63 |
+
triton==3.2.0
|
| 64 |
+
cycler==0.12.1
|
| 65 |
+
torch==2.6.0
|
| 66 |
+
python-magic==0.4.27
|
| 67 |
+
ptyprocess==0.7.0
|
| 68 |
+
regex==2024.11.6
|
| 69 |
+
absl-py==2.2.2
|
| 70 |
+
psutil==7.0.0
|
| 71 |
+
murmurhash==1.0.13
|
| 72 |
+
wrapt==1.17.2
|
| 73 |
+
pycocoevalcap==1.2
|
| 74 |
+
python-slugify==8.0.4
|
| 75 |
+
stack-data==0.6.3
|
| 76 |
+
python-dateutil==2.9.0.post0
|
| 77 |
+
scipy==1.15.3
|
| 78 |
+
annotated-types==0.7.0
|
| 79 |
+
mpmath==1.3.0
|
| 80 |
+
ipython==8.36.0
|
| 81 |
+
pyparsing==3.2.3
|
| 82 |
+
nvidia-nvtx-cu12==12.4.127
|
| 83 |
+
fairscale==0.4.4
|
| 84 |
+
jsonschema-specifications==2025.4.1
|
| 85 |
+
matplotlib-inline==0.1.7
|
| 86 |
+
watchdog==6.0.0
|
| 87 |
+
thinc==8.3.6
|
| 88 |
+
antlr4-python3-runtime==4.9.3
|
| 89 |
+
webencodings==0.5.1
|
| 90 |
+
hjson==3.1.0
|
| 91 |
+
propcache==0.3.1
|
| 92 |
+
virtualenv==20.31.2
|
| 93 |
+
pytorch-lightning==2.5.1.post0
|
| 94 |
+
Pygments==2.19.1
|
| 95 |
+
pillow==11.2.1
|
| 96 |
+
joblib==1.5.1
|
| 97 |
+
tqdm==4.67.1
|
| 98 |
+
timm==0.4.12
|
| 99 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 100 |
+
aiosignal==1.3.2
|
| 101 |
+
kaggle==1.7.4.5
|
| 102 |
+
idna==3.10
|
| 103 |
+
pycocotools==2.0.8
|
| 104 |
+
MarkupSafe==3.0.2
|
| 105 |
+
traitlets==5.14.3
|
| 106 |
+
multidict==6.4.4
|
| 107 |
+
distlib==0.3.9
|
| 108 |
+
torchmetrics==1.7.1
|
| 109 |
+
pyarrow==20.0.0
|
| 110 |
+
tzdata==2025.2
|
| 111 |
+
platformdirs==4.3.8
|
| 112 |
+
yarl==1.20.0
|
| 113 |
+
tenacity==9.1.2
|
| 114 |
+
altair==5.5.0
|
| 115 |
+
wasabi==1.1.3
|
| 116 |
+
attrs==25.3.0
|
| 117 |
+
contourpy==1.3.2
|
| 118 |
+
kiwisolver==1.4.8
|
| 119 |
+
PyYAML==6.0.2
|
| 120 |
+
exceptiongroup==1.3.0
|
| 121 |
+
jedi==0.19.2
|
| 122 |
+
sentencepiece==0.2.0
|
| 123 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 124 |
+
requests==2.32.3
|
| 125 |
+
opendatasets==0.1.22
|
| 126 |
+
GitPython==3.1.44
|
| 127 |
+
bleach==6.2.0
|
| 128 |
+
protobuf==6.31.0
|
| 129 |
+
sympy==1.13.1
|
| 130 |
+
filelock==3.18.0
|
| 131 |
+
pre_commit==4.2.0
|
| 132 |
+
text-unidecode==1.3
|
| 133 |
+
wheel==0.45.1
|
| 134 |
+
contexttimer==0.3.3
|
| 135 |
+
wcwidth==0.2.13
|
| 136 |
+
spacy-legacy==3.0.12
|
| 137 |
+
aiohappyeyeballs==2.6.1
|
| 138 |
+
imageio==2.37.0
|
| 139 |
+
nodeenv==1.9.1
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
hf-xet==1.1.2
|
| 142 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 143 |
+
weasel==0.4.1
|
| 144 |
+
certifi==2025.4.26
|
| 145 |
+
lightning-utilities==0.14.3
|
| 146 |
+
typing_extensions==4.13.2
|
| 147 |
+
typing-inspection==0.4.1
|
| 148 |
+
webdataset==0.2.111
|
| 149 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 150 |
+
asttokens==3.0.0
|
| 151 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 152 |
+
opencv-python-headless==4.5.5.64
|
| 153 |
+
smmap==5.0.2
|
| 154 |
+
tifffile==2025.5.10
|
| 155 |
+
iopath==0.1.10
|
| 156 |
+
packaging==24.2
|
| 157 |
+
cymem==2.0.11
|
| 158 |
+
spacy-loggers==1.0.5
|
| 159 |
+
ninja==1.11.1.4
|
| 160 |
+
ftfy==6.3.1
|
| 161 |
+
executing==2.2.0
|
| 162 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 163 |
+
blinker==1.9.0
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
parso==0.8.4
|
| 166 |
+
salesforce-lavis==1.0.2
|
| 167 |
+
frozenlist==1.6.0
|
| 168 |
+
shellingham==1.5.4
|
| 169 |
+
flash-attn==2.7.1.post1
|
| 170 |
+
pycparser==2.22
|
| 171 |
+
threadpoolctl==3.6.0
|
| 172 |
+
opencv-python==4.11.0.86
|
| 173 |
+
fsspec==2025.3.0
|
| 174 |
+
aiohttp==3.12.2
|
| 175 |
+
narwhals==1.41.0
|
| 176 |
+
opendelta==0.3.2
|
| 177 |
+
pycryptodome==3.23.0
|
| 178 |
+
crcmod==1.7
|
| 179 |
+
delta-center-client==0.0.4
|
| 180 |
+
tokenizers==0.21.1
|
| 181 |
+
aliyun-python-sdk-kms==2.16.5
|
| 182 |
+
more-itertools==10.7.0
|
| 183 |
+
yacs==0.1.8
|
| 184 |
+
bigmodelvis==0.0.1
|
| 185 |
+
jmespath==0.10.0
|
| 186 |
+
docker-pycreds==0.4.0
|
| 187 |
+
web.py==0.62
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
pip==25.1.1
|
| 190 |
+
cheroot==10.0.1
|
| 191 |
+
setproctitle==1.3.6
|
| 192 |
+
huggingface-hub==0.32.1
|
| 193 |
+
oss2==2.15.0
|
| 194 |
+
cryptography==45.0.3
|
| 195 |
+
typer==0.16.0
|
| 196 |
+
xxhash==3.5.0
|
| 197 |
+
jsonschema==4.24.0
|
| 198 |
+
click==8.2.1
|
| 199 |
+
preshed==3.0.10
|
| 200 |
+
sentry-sdk==2.29.1
|
| 201 |
+
wandb==0.19.11
|
| 202 |
+
dill==0.3.8
|
| 203 |
+
aliyun-python-sdk-core==2.16.0
|
| 204 |
+
transformers==4.52.3
|
| 205 |
+
cffi==1.17.1
|
| 206 |
+
pathlib==1.0.1
|
| 207 |
+
jaraco.functools==4.1.0
|
| 208 |
+
datasets==3.6.0
|
| 209 |
+
multiprocess==0.70.16
|
| 210 |
+
backports.tarfile==1.2.0
|
| 211 |
+
tomli==2.0.1
|
| 212 |
+
autocommand==2.2.2
|
| 213 |
+
zipp==3.19.2
|
| 214 |
+
jaraco.text==3.12.1
|
| 215 |
+
jaraco.collections==5.1.0
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typeguard==4.3.0
|
| 218 |
+
typing_extensions==4.12.2
|
| 219 |
+
jaraco.functools==4.0.1
|
| 220 |
+
inflect==7.3.1
|
| 221 |
+
wheel==0.45.1
|
| 222 |
+
more-itertools==10.3.0
|
| 223 |
+
importlib_metadata==8.0.0
|
| 224 |
+
jaraco.context==5.3.0
|
| 225 |
+
packaging==24.2
|