diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_token_acc.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_token_acc.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e2443f6d742910c248cbb383ae32fa7ee7ededd
Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_token_acc.png differ
diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_total_flos.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_total_flos.png
new file mode 100644
index 0000000000000000000000000000000000000000..fb1d61a8123da51c800af60600dcc71f7bc8abc3
Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_total_flos.png differ
diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_train_loss.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_train_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..05a62360d80e7083e58558fca324bb57890fb487
Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_train_loss.png differ
diff --git a/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..ca3b723446681dd3fc03c497714766c5e551dbdc
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log
@@ -0,0 +1,63 @@
+{"time":"2025-06-29T00:07:02.130913564+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-core.log"}
+{"time":"2025-06-29T00:07:16.339720801+08:00","level":"INFO","msg":"created new stream","id":"rypk39yq"}
+{"time":"2025-06-29T00:07:16.340562919+08:00","level":"INFO","msg":"stream: started","id":"rypk39yq"}
+{"time":"2025-06-29T00:07:16.340584288+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rypk39yq"}
+{"time":"2025-06-29T00:07:16.340617888+08:00","level":"INFO","msg":"sender: started","stream_id":"rypk39yq"}
+{"time":"2025-06-29T00:07:16.340654242+08:00","level":"INFO","msg":"handler: started","stream_id":"rypk39yq"}
+{"time":"2025-06-29T00:07:28.033909694+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-29T00:12:24.114755958+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52688->104.21.20.172:443: read: connection timed out"}
+{"time":"2025-06-29T00:15:17.682707235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:43992->104.21.20.172:443: read: connection timed out"}
+{"time":"2025-06-29T00:16:13.20335199+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2025-06-29T00:16:45.525802023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2025-06-29T00:17:19.98711773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T00:18:06.642780387+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:36080->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:22:43.123257688+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52664->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:26:08.434737599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42534->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:27:44.454100719+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53006->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:29:13.211268181+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T00:29:45.68436365+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T00:30:19.759580601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T00:30:33.650730605+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:38754->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:30:58.011093426+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T00:34:39.922752645+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:35350->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:36:41.88529828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"} +{"time":"2025-06-29T00:37:20.878368218+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:46470->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:38:49.414424011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"} +{"time":"2025-06-29T00:38:58.216757113+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T00:39:20.141003198+08:00","level":"INFO","msg":"api: 
retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53708->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:41:33.299264534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44198->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:47:37.138754922+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39138->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T00:54:28.224811124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T00:55:15.429710397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:49584->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:55:36.251525534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:55184->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-29T00:56:12.092902722+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": http2: client conn is closed"} +{"time":"2025-06-29T00:59:32.604209299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44582->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-29T01:00:43.231046844+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T01:05:28.234577388+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:06:00.428439859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T01:06:35.403033399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T01:07:13.835463934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:12:30.014897464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"} +{"time":"2025-06-29T01:14:58.239397356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:15:30.658073848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} 
+{"time":"2025-06-29T01:16:05.133874663+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:16:43.256922452+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T01:17:07.122753765+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42208->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T01:17:31.631854783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:18:38.479583401+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:20:08.481626584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-29T01:21:38.483904393+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:22:09.185192206+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T01:28:06.578759778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39408->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-29T02:00:40.766530394+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"error code: 502"} +{"time":"2025-06-29T08:45:43.611887283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-29T08:45:55.061157169+08:00","level":"INFO","msg":"api: retrying HTTP error","status":520,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 520"} +{"time":"2025-06-29T08:51:23.638432293+08:00","level":"INFO","msg":"api: retrying HTTP error","status":524,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 524"} +{"time":"2025-06-29T10:16:08.309722526+08:00","level":"INFO","msg":"stream: closing","id":"rypk39yq"} +{"time":"2025-06-29T10:16:08.309813211+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-29T10:16:08.311047133+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-06-29T10:16:10.887637294+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-29T10:16:11.831362524+08:00","level":"INFO","msg":"handler: closed","stream_id":"rypk39yq"} +{"time":"2025-06-29T10:16:11.831401295+08:00","level":"INFO","msg":"sender: closed","stream_id":"rypk39yq"} +{"time":"2025-06-29T10:16:11.831391+08:00","level":"INFO","msg":"writer: Close: 
closed","stream_id":"rypk39yq"} +{"time":"2025-06-29T10:16:11.835883161+08:00","level":"INFO","msg":"stream: closed","id":"rypk39yq"} diff --git a/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log b/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b15e3cda4bdab0e51e02b73f6c588f8762f439da --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log @@ -0,0 +1,23 @@ +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Configure stats pid to 938398 +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():852] calling init triggers +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():893] starting backend +2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():897] sending inform_init request +2025-06-29 00:07:02,122 INFO MainThread:938398 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-29 00:07:02,125 INFO MainThread:938398 [wandb_init.py:init():907] backend started and connected +2025-06-29 00:07:02,126 INFO MainThread:938398 [wandb_init.py:init():1005] updated telemetry +2025-06-29 00:07:02,129 INFO MainThread:938398 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-29 00:07:27,982 INFO MainThread:938398 [wandb_init.py:init():1104] starting run threads in backend +2025-06-29 00:07:28,171 INFO MainThread:938398 [wandb_run.py:_console_start():2573] atexit reg +2025-06-29 00:07:28,172 INFO MainThread:938398 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-29 00:07:28,177 INFO MainThread:938398 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-29 10:16:08,240 INFO MsgRouterThr:938398 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1b524f6bc9878b6d4456ad4a13be043e37a03d5
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-metadata.json
@@ -0,0 +1,100 @@
+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-28T12:54:50.854308Z",
+  "args": [
+    "--devices",
+    "0,1,2,3,4,5,6,7",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_ckpt",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger",
+    "--strategy",
+    "ddp"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_ckpt/",
+  "host": "dsw-265304-558499d597-hhhs7",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "11399286784"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}
\ No newline at end of file
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..43bf8ae4a127ce942762944e6ac015b65e12a482
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":461}}
\ No newline at end of file
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..33c2138f86674fe9ed7434a7cbafe153e6ab33ae
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log
@@ -0,0 +1,15 @@
+{"time":"2025-06-28T20:54:50.857319195+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-core.log"} +{"time":"2025-06-28T20:54:52.34815986+08:00","level":"INFO","msg":"created new stream","id":"irx8yzsh"} +{"time":"2025-06-28T20:54:52.348208346+08:00","level":"INFO","msg":"stream: started","id":"irx8yzsh"} +{"time":"2025-06-28T20:54:52.348232003+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"irx8yzsh"} +{"time":"2025-06-28T20:54:52.348267908+08:00","level":"INFO","msg":"handler: started","stream_id":"irx8yzsh"} +{"time":"2025-06-28T20:54:52.348288286+08:00","level":"INFO","msg":"sender: started","stream_id":"irx8yzsh"} +{"time":"2025-06-28T20:54:53.671617933+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-28T21:02:32.638770807+08:00","level":"INFO","msg":"stream: closing","id":"irx8yzsh"} +{"time":"2025-06-28T21:02:32.638994707+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-28T21:02:32.646934986+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-06-28T21:02:34.307048324+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-28T21:02:35.437245138+08:00","level":"INFO","msg":"handler: closed","stream_id":"irx8yzsh"} +{"time":"2025-06-28T21:02:35.437313629+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"irx8yzsh"} +{"time":"2025-06-28T21:02:35.437394408+08:00","level":"INFO","msg":"sender: closed","stream_id":"irx8yzsh"} +{"time":"2025-06-28T21:02:35.441861626+08:00","level":"INFO","msg":"stream: closed","id":"irx8yzsh"} diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..45ae9f98a5e2636f54efa8724c48f25ef9dfa588 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log @@ -0,0 +1,23 @@ +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Configure stats pid to 45186 +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():852] calling init triggers +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():893] starting backend +2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():897] sending inform_init request 
+2025-06-28 20:54:50,845 INFO MainThread:45186 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-28 20:54:50,852 INFO MainThread:45186 [wandb_init.py:init():907] backend started and connected
+2025-06-28 20:54:50,855 INFO MainThread:45186 [wandb_init.py:init():1005] updated telemetry
+2025-06-28 20:54:50,859 INFO MainThread:45186 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-28 20:54:53,659 INFO MainThread:45186 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-28 20:54:53,876 INFO MainThread:45186 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-28 20:54:53,876 INFO MainThread:45186 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-28 20:54:53,923 INFO MainThread:45186 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-28 20:54:53,924 INFO MainThread:45186 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-28 20:54:53,930 INFO MainThread:45186 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-28 21:02:32,610 INFO MsgRouterThr:45186 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/config.yaml b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3df74576bd4502c6c8aadfdbee519d24f541f48a
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/config.yaml
@@ -0,0 +1,129 @@
+_wandb:
+  value:
+    cli_version: 0.19.11
+    m:
+      - "1": trainer/global_step
+        "6":
+          - 3
+        "7": []
+    python_version: 3.10.0
+    t:
+      "1":
+        - 1
+        - 5
+        - 9
+        - 11
+        - 33
+        - 41
+        - 49
+        - 53
+        - 55
+        - 63
+        - 103
+      "2":
+        - 1
+        - 5
+        - 9
+        - 11
+        - 33
+        - 41
+        - 49
+        - 53
+        - 55
+        - 63
+        - 103
+      "3":
+        - 7
+        - 23
+        - 33
+        - 55
+        - 66
+      "4": 3.10.0
+      "5": 0.19.11
+      "6": 4.52.3
+      "8":
+        - 5
+      "12": 0.19.11
+      "13": linux-x86_64
+accelerator:
+  value: gpu
+batch_size:
+  value: 168
+bert_hidden_dim:
+  value: 768
+bert_name:
+  value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+  value: 1
+cross_attention_freq:
+  value: 2
+devices:
+  value: 0,1,2,3,4,5,6,7
+filename:
+  value: stage1_ckpt
+init_checkpoint:
+  value: ""
+init_lr:
+  value: 0.0001
+lm:
+  value: true
+load_4bit:
+  value: false
+lr_decay_rate:
+  value: 0.9
+match_batch_size:
+  value: 64
+max_epochs:
+  value: 20
+min_lr:
+  value: 1e-05
+mix_dataset:
+  value: true
+mode:
+  value: train
+num_query_token:
+  value: 8
+num_workers:
+  value: 8
+plm_name:
+  value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+  value: freeze
+pool_size:
+  value: 0
+precision:
+  value: bf16-mixed
+projection_dim:
+  value: 256
+prot_aug:
+  value: None
+prot_max_len:
+  value: 1024
+ptm:
+  value: true
+rerank_cand_num:
+  value: 128
+retrieval_eval_epoch:
+  value: 10
+root:
+  value: data_small
+save_every_n_epochs:
+  value: 5
+scheduler:
+  value: linear_warmup_cosine_lr
+seed:
+  value: 42
+strategy:
+  value: ddp
+temperature:
+  value: 0.1
+text_max_len:
+  value: 128
+use_wandb_logger:
+  value: true
+warmup_lr:
+  value: 1e-06
+warmup_steps:
+  value: 1000
+weight_decay:
+  value: 0.05
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/output.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..bb74f82a041a866426176fa3172ee6aa715e896b
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/output.log
@@ -0,0 +1,145 @@
+W0628 21:09:07.551322 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 234629 via signal SIGTERM
+W0628 21:09:07.552175 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 234847 via signal SIGTERM
+W0628 21:09:07.552469 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235007 via signal SIGTERM
+W0628 21:09:07.552683 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235147 via signal SIGTERM
+W0628 21:09:07.552918 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235302 via signal SIGTERM
+W0628 21:09:07.553165 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235447 via signal SIGTERM
+W0628 21:09:07.553355 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235583 via signal SIGTERM
+Traceback (most recent call last):
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 135, in <module>
+    main(args)
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 101, in main
+    trainer.fit(model, datamodule=dm)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+    call._call_and_handle_interrupt(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 144, in launch
+    while not process_context.join():
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 215, in join
+    raise ProcessRaisedException(msg, error_index, failed_process.pid)
+torch.multiprocessing.spawn.ProcessRaisedException:
+
+-- Process 1 terminated with the following error:
+Traceback (most recent call last):
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
+    fn(i, *args)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
+    results = function(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+    self._run(model, ckpt_path=ckpt_path)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+    results = self._run_stage()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
+    self.fit_loop.run()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
+    self.advance()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
+    self.epoch_loop.run(self._data_fetcher)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
+    self.advance(data_fetcher)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
+    batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
+    self._optimizer_step(batch_idx, closure)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
+    call._call_lightning_module_hook(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
+    optimizer.step(closure=optimizer_closure)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
+    step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
+    optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
+    return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/amp.py", line 76, in optimizer_step
+    return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 123, in optimizer_step
+    return optimizer.step(closure=closure, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/optimizer.py", line 493, in wrapper
+    out = func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
+    ret = func(self, *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
+    loss = closure()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 109, in _wrap_closure
+    closure_result = closure()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
+    self._result = self.closure(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
+    step_output = self._step_fn()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
+    training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
+    return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+    wrapper_output = wrapper_module(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1643, in forward
+    else self._run_ddp_forward(*inputs, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1459, in _run_ddp_forward
+    return self.module(*inputs, **kwargs)  # type: ignore[index]
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+    out = method(*_args, **_kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 184, in training_step
+    blip2_loss = self.blip2qformer(batch)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 228, in forward
+    lm_output = self.Qformer(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 1046, in forward
+    outputs = self.bert(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 937, in forward
+    encoder_outputs = self.encoder(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 550, in forward
+    layer_outputs = layer_module(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 464, in forward
+    layer_output = apply_chunking_to_forward(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 253, in apply_chunking_to_forward
+    return forward_fn(*input_tensors)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 477, in feed_forward_chunk
+    intermediate_output = self.intermediate(attention_output)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 359, in forward
+    hidden_states = self.dense(hidden_states)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
+    return F.linear(input, self.weight, self.bias)
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 252.00 MiB. GPU 1 has a total capacity of 79.35 GiB of which 112.19 MiB is free. Process 1747899 has 79.23 GiB memory in use. Of the allocated memory 77.42 GiB is allocated by PyTorch, and 738.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/requirements.txt b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4b19413ecc562d59f65f708a0c214f92e7006d2
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/requirements.txt
@@ -0,0 +1,225 @@
+marisa-trie==1.2.1
+pydantic==2.11.5
+mdurl==0.1.2
+gitdb==4.0.12
+scikit-image==0.25.2
+async-timeout==5.0.1
+blis==1.3.0
+urllib3==2.4.0
+spacy==3.8.7
+nvidia-ml-py==12.575.51
+braceexpand==0.1.7
+nvidia-cufft-cu12==11.2.1.3
+rich==14.0.0
+setuptools==78.1.1
+matplotlib==3.10.3
+catalogue==2.0.10
+decord==0.6.0
+numpy==2.2.6
+charset-normalizer==3.4.2
+langcodes==3.5.0
+pexpect==4.9.0
+nltk==3.9.1
+cachetools==5.5.2
+cfgv==3.4.0
+prompt_toolkit==3.0.51
+srsly==2.5.1
+einops==0.8.1
+Jinja2==3.1.6
+cloudpathlib==0.21.1
+streamlit==1.45.1
+pydantic_core==2.33.2
+tornado==6.5.1
+nvidia-curand-cu12==10.3.5.147
+deepspeed==0.16.10+b666844f
+networkx==3.4.2
+omegaconf==2.3.0
+msgpack==1.1.0
+pandas==2.2.3
+rouge_score==0.1.2
+six==1.17.0
+language_data==1.3.0
+referencing==0.36.2
+rpds-py==0.25.1
+lazy_loader==0.4
+pydeck==0.9.1
+markdown-it-py==3.0.0
+fonttools==4.58.0
+nvidia-cuda-runtime-cu12==12.4.127
+smart-open==7.1.0
+identify==2.6.12
+pure_eval==0.2.3
+confection==0.1.5
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparselt-cu12==0.6.2
+decorator==5.2.1
+nvidia-nccl-cu12==2.21.5
+pytz==2025.2
+nvidia-cudnn-cu12==9.1.0.70
+plotly==6.1.1
+safetensors==0.5.3
+portalocker==3.1.1
+toml==0.10.2
+triton==3.2.0
+cycler==0.12.1
+torch==2.6.0
+python-magic==0.4.27
+ptyprocess==0.7.0
+regex==2024.11.6
+absl-py==2.2.2
+psutil==7.0.0
+murmurhash==1.0.13
+wrapt==1.17.2
+pycocoevalcap==1.2
+python-slugify==8.0.4
+stack-data==0.6.3
+python-dateutil==2.9.0.post0
+scipy==1.15.3
+annotated-types==0.7.0
+mpmath==1.3.0
+ipython==8.36.0
+pyparsing==3.2.3
+nvidia-nvtx-cu12==12.4.127
+fairscale==0.4.4
+jsonschema-specifications==2025.4.1
+matplotlib-inline==0.1.7
+watchdog==6.0.0
+thinc==8.3.6
+antlr4-python3-runtime==4.9.3
+webencodings==0.5.1
+hjson==3.1.0
+propcache==0.3.1
+virtualenv==20.31.2
+pytorch-lightning==2.5.1.post0
+Pygments==2.19.1
+pillow==11.2.1
+joblib==1.5.1
+tqdm==4.67.1
+timm==0.4.12
+nvidia-nvjitlink-cu12==12.4.127
+aiosignal==1.3.2
+kaggle==1.7.4.5
+idna==3.10
+pycocotools==2.0.8
+MarkupSafe==3.0.2
+traitlets==5.14.3
+multidict==6.4.4
+distlib==0.3.9
+torchmetrics==1.7.1
+pyarrow==20.0.0
+tzdata==2025.2
+platformdirs==4.3.8
+yarl==1.20.0
+tenacity==9.1.2
+altair==5.5.0
+wasabi==1.1.3
+attrs==25.3.0
+contourpy==1.3.2
+kiwisolver==1.4.8
+PyYAML==6.0.2
+exceptiongroup==1.3.0
+jedi==0.19.2
+sentencepiece==0.2.0
+nvidia-cusolver-cu12==11.6.1.9
+requests==2.32.3
+opendatasets==0.1.22
+GitPython==3.1.44
+bleach==6.2.0
+protobuf==6.31.0
+sympy==1.13.1
+filelock==3.18.0
+pre_commit==4.2.0
+text-unidecode==1.3
+wheel==0.45.1
+contexttimer==0.3.3
+wcwidth==0.2.13
+spacy-legacy==3.0.12
+aiohappyeyeballs==2.6.1
+imageio==2.37.0
+nodeenv==1.9.1
+py-cpuinfo==9.0.0
+hf-xet==1.1.2
+nvidia-cuda-cupti-cu12==12.4.127
+weasel==0.4.1
+certifi==2025.4.26
+lightning-utilities==0.14.3
+typing_extensions==4.13.2
+typing-inspection==0.4.1
+webdataset==0.2.111
+nvidia-cusparse-cu12==12.3.1.170
+asttokens==3.0.0
+nvidia-cufile-cu12==1.11.1.6
+opencv-python-headless==4.5.5.64
+smmap==5.0.2
+tifffile==2025.5.10
+iopath==0.1.10
+packaging==24.2
+cymem==2.0.11
+spacy-loggers==1.0.5
+ninja==1.11.1.4
+ftfy==6.3.1
+executing==2.2.0
+nvidia-cuda-nvrtc-cu12==12.4.127
+blinker==1.9.0
+torchvision==0.21.0
+parso==0.8.4
+salesforce-lavis==1.0.2
+frozenlist==1.6.0
+shellingham==1.5.4
+flash-attn==2.7.1.post1
+pycparser==2.22
+threadpoolctl==3.6.0
+opencv-python==4.11.0.86
+fsspec==2025.3.0
+aiohttp==3.12.2
+narwhals==1.41.0
+opendelta==0.3.2
+pycryptodome==3.23.0
+crcmod==1.7
+delta-center-client==0.0.4
+tokenizers==0.21.1
+aliyun-python-sdk-kms==2.16.5
+more-itertools==10.7.0
+yacs==0.1.8
+bigmodelvis==0.0.1
+jmespath==0.10.0
+docker-pycreds==0.4.0
+web.py==0.62
+scikit-learn==1.6.1
+pip==25.1.1
+cheroot==10.0.1
+setproctitle==1.3.6
+huggingface-hub==0.32.1
+oss2==2.15.0
+cryptography==45.0.3
+typer==0.16.0
+xxhash==3.5.0
+jsonschema==4.24.0
+click==8.2.1
+preshed==3.0.10
+sentry-sdk==2.29.1
+wandb==0.19.11
+dill==0.3.8
+aliyun-python-sdk-core==2.16.0
+transformers==4.52.3
+cffi==1.17.1
+pathlib==1.0.1
+jaraco.functools==4.1.0
+datasets==3.6.0
+multiprocess==0.70.16
+backports.tarfile==1.2.0
+tomli==2.0.1
+autocommand==2.2.2
+zipp==3.19.2
+jaraco.text==3.12.1
+jaraco.collections==5.1.0
+platformdirs==4.2.2
+typeguard==4.3.0
+typing_extensions==4.12.2
+jaraco.functools==4.0.1
+inflect==7.3.1
+wheel==0.45.1
+more-itertools==10.3.0
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+packaging==24.2
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..13694c90edf4d35a13ce79970bf264638a3791d3
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-metadata.json
@@ -0,0 +1,100 @@
+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-28T13:03:20.280479Z",
+  "args": [
+    "--devices",
+    "0,1,2,3,4,5,6,7",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_ckpt",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "168",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger",
+    "--strategy",
+    "ddp"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_ckpt/",
+  "host": "dsw-265304-558499d597-hhhs7",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "11399290880"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+ { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7d9a46293e3453e3b7bd390977674124bbe725ff --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":349}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8e035efc8cf8d11dc3468a30d3bafaed54ff7cce --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-06-28T21:03:20.284878435+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-core.log"} +{"time":"2025-06-28T21:03:21.431223505+08:00","level":"INFO","msg":"created new stream","id":"rrhzb5iq"} +{"time":"2025-06-28T21:03:21.431273053+08:00","level":"INFO","msg":"stream: started","id":"rrhzb5iq"} +{"time":"2025-06-28T21:03:21.431320218+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:03:21.431348318+08:00","level":"INFO","msg":"handler: started","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:03:21.431364756+08:00","level":"INFO","msg":"sender: started","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:03:22.981144572+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-28T21:09:09.647087317+08:00","level":"INFO","msg":"stream: closing","id":"rrhzb5iq"} +{"time":"2025-06-28T21:09:09.647148642+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-28T21:09:09.648142179+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-06-28T21:09:11.22982496+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-28T21:09:12.511227827+08:00","level":"INFO","msg":"handler: closed","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:09:12.51128659+08:00","level":"INFO","msg":"sender: closed","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:09:12.511284269+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rrhzb5iq"} +{"time":"2025-06-28T21:09:12.519090125+08:00","level":"INFO","msg":"stream: closed","id":"rrhzb5iq"} diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ca14d32439cfe7a0d8446a907a62f1611f29509b --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log @@ -0,0 +1,23 @@ +2025-06-28 
21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Configure stats pid to 234012 +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():852] calling init triggers +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():893] starting backend +2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():897] sending inform_init request +2025-06-28 21:03:20,276 INFO MainThread:234012 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-28 21:03:20,280 INFO MainThread:234012 [wandb_init.py:init():907] backend started and connected +2025-06-28 21:03:20,281 INFO MainThread:234012 [wandb_init.py:init():1005] updated telemetry +2025-06-28 21:03:20,281 INFO MainThread:234012 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-28 21:03:22,930 INFO MainThread:234012 [wandb_init.py:init():1104] starting run threads in backend +2025-06-28 21:03:23,114 INFO MainThread:234012 [wandb_run.py:_console_start():2573] atexit reg +2025-06-28 21:03:23,114 INFO MainThread:234012 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-28 21:03:23,118 INFO MainThread:234012 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-28 21:03:23,118 INFO MainThread:234012 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-28 21:03:23,120 INFO MainThread:234012 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-28 21:09:09,645 INFO MsgRouterThr:234012 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
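The rrhzb5iq run above died with a CUDA OOM at batch_size 168 (the follow-up qflz8r5n run below retries at 64), and the error text itself recommends the expandable-segments allocator. A minimal sketch of applying that hint, assuming it is exported before CUDA is initialized; note this only helps fragmentation-style OOMs, not a working set that genuinely exceeds the 80 GB card.

```python
import os

# Assumption: the allocator config must be in the environment before the
# CUDA context is created, i.e. before the first GPU allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # noqa: E402  (imported after the env var is set)

if torch.cuda.is_available():
    x = torch.zeros(1024, device="cuda")  # first allocation picks up the config
```

Lowering the per-device batch size, as the qflz8r5n run does, remains the more reliable fix when the model simply does not fit.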
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/run-rrhzb5iq.wandb b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/run-rrhzb5iq.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..be9f4c78458b5f79c18800639c15d25521e148e3
Binary files /dev/null and b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/run-rrhzb5iq.wandb differ
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/output.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/requirements.txt b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4b19413ecc562d59f65f708a0c214f92e7006d2
--- /dev/null
+++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/requirements.txt
@@ -0,0 +1,225 @@
+marisa-trie==1.2.1
+pydantic==2.11.5
+mdurl==0.1.2
+gitdb==4.0.12
+scikit-image==0.25.2
+async-timeout==5.0.1
+blis==1.3.0
+urllib3==2.4.0
+spacy==3.8.7
+nvidia-ml-py==12.575.51
+braceexpand==0.1.7
+nvidia-cufft-cu12==11.2.1.3
+rich==14.0.0
+setuptools==78.1.1
+matplotlib==3.10.3
+catalogue==2.0.10
+decord==0.6.0
+numpy==2.2.6
+charset-normalizer==3.4.2
+langcodes==3.5.0
+pexpect==4.9.0
+nltk==3.9.1
+cachetools==5.5.2
+cfgv==3.4.0
+prompt_toolkit==3.0.51
+srsly==2.5.1
+einops==0.8.1
+Jinja2==3.1.6
+cloudpathlib==0.21.1
+streamlit==1.45.1
+pydantic_core==2.33.2
+tornado==6.5.1
+nvidia-curand-cu12==10.3.5.147
+deepspeed==0.16.10+b666844f
+networkx==3.4.2
+omegaconf==2.3.0
+msgpack==1.1.0
+pandas==2.2.3
+rouge_score==0.1.2
+six==1.17.0
+language_data==1.3.0
+referencing==0.36.2
+rpds-py==0.25.1
+lazy_loader==0.4
+pydeck==0.9.1
+markdown-it-py==3.0.0
+fonttools==4.58.0
+nvidia-cuda-runtime-cu12==12.4.127
+smart-open==7.1.0
+identify==2.6.12
+pure_eval==0.2.3
+confection==0.1.5
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cusparselt-cu12==0.6.2
+decorator==5.2.1
+nvidia-nccl-cu12==2.21.5
+pytz==2025.2
+nvidia-cudnn-cu12==9.1.0.70
+plotly==6.1.1
+safetensors==0.5.3
+portalocker==3.1.1
+toml==0.10.2
+triton==3.2.0
+cycler==0.12.1
+torch==2.6.0
+python-magic==0.4.27
+ptyprocess==0.7.0
+regex==2024.11.6
+absl-py==2.2.2
+psutil==7.0.0
+murmurhash==1.0.13
+wrapt==1.17.2
+pycocoevalcap==1.2
+python-slugify==8.0.4
+stack-data==0.6.3
+python-dateutil==2.9.0.post0
+scipy==1.15.3
+annotated-types==0.7.0
+mpmath==1.3.0
+ipython==8.36.0
+pyparsing==3.2.3
+nvidia-nvtx-cu12==12.4.127
+fairscale==0.4.4
+jsonschema-specifications==2025.4.1
+matplotlib-inline==0.1.7
+watchdog==6.0.0
+thinc==8.3.6
+antlr4-python3-runtime==4.9.3
+webencodings==0.5.1
+hjson==3.1.0
+propcache==0.3.1
+virtualenv==20.31.2
+pytorch-lightning==2.5.1.post0
+Pygments==2.19.1
+pillow==11.2.1
+joblib==1.5.1
+tqdm==4.67.1
+timm==0.4.12
+nvidia-nvjitlink-cu12==12.4.127
+aiosignal==1.3.2
+kaggle==1.7.4.5
+idna==3.10
+pycocotools==2.0.8
+MarkupSafe==3.0.2
+traitlets==5.14.3
+multidict==6.4.4
+distlib==0.3.9
+torchmetrics==1.7.1
+pyarrow==20.0.0
+tzdata==2025.2
+platformdirs==4.3.8
+yarl==1.20.0
+tenacity==9.1.2
+altair==5.5.0
+wasabi==1.1.3
+attrs==25.3.0
+contourpy==1.3.2
+kiwisolver==1.4.8
+PyYAML==6.0.2
+exceptiongroup==1.3.0
+jedi==0.19.2
+sentencepiece==0.2.0
+nvidia-cusolver-cu12==11.6.1.9
+requests==2.32.3 +opendatasets==0.1.22 +GitPython==3.1.44 +bleach==6.2.0 +protobuf==6.31.0 +sympy==1.13.1 +filelock==3.18.0 +pre_commit==4.2.0 +text-unidecode==1.3 +wheel==0.45.1 +contexttimer==0.3.3 +wcwidth==0.2.13 +spacy-legacy==3.0.12 +aiohappyeyeballs==2.6.1 +imageio==2.37.0 +nodeenv==1.9.1 +py-cpuinfo==9.0.0 +hf-xet==1.1.2 +nvidia-cuda-cupti-cu12==12.4.127 +weasel==0.4.1 +certifi==2025.4.26 +lightning-utilities==0.14.3 +typing_extensions==4.13.2 +typing-inspection==0.4.1 +webdataset==0.2.111 +nvidia-cusparse-cu12==12.3.1.170 +asttokens==3.0.0 +nvidia-cufile-cu12==1.11.1.6 +opencv-python-headless==4.5.5.64 +smmap==5.0.2 +tifffile==2025.5.10 +iopath==0.1.10 +packaging==24.2 +cymem==2.0.11 +spacy-loggers==1.0.5 +ninja==1.11.1.4 +ftfy==6.3.1 +executing==2.2.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +blinker==1.9.0 +torchvision==0.21.0 +parso==0.8.4 +salesforce-lavis==1.0.2 +frozenlist==1.6.0 +shellingham==1.5.4 +flash-attn==2.7.1.post1 +pycparser==2.22 +threadpoolctl==3.6.0 +opencv-python==4.11.0.86 +fsspec==2025.3.0 +aiohttp==3.12.2 +narwhals==1.41.0 +opendelta==0.3.2 +pycryptodome==3.23.0 +crcmod==1.7 +delta-center-client==0.0.4 +tokenizers==0.21.1 +aliyun-python-sdk-kms==2.16.5 +more-itertools==10.7.0 +yacs==0.1.8 +bigmodelvis==0.0.1 +jmespath==0.10.0 +docker-pycreds==0.4.0 +web.py==0.62 +scikit-learn==1.6.1 +pip==25.1.1 +cheroot==10.0.1 +setproctitle==1.3.6 +huggingface-hub==0.32.1 +oss2==2.15.0 +cryptography==45.0.3 +typer==0.16.0 +xxhash==3.5.0 +jsonschema==4.24.0 +click==8.2.1 +preshed==3.0.10 +sentry-sdk==2.29.1 +wandb==0.19.11 +dill==0.3.8 +aliyun-python-sdk-core==2.16.0 +transformers==4.52.3 +cffi==1.17.1 +pathlib==1.0.1 +jaraco.functools==4.1.0 +datasets==3.6.0 +multiprocess==0.70.16 +backports.tarfile==1.2.0 +tomli==2.0.1 +autocommand==2.2.2 +zipp==3.19.2 +jaraco.text==3.12.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +typeguard==4.3.0 +typing_extensions==4.12.2 +jaraco.functools==4.0.1 +inflect==7.3.1 +wheel==0.45.1 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +packaging==24.2 diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..baf6e4a8faeeab0ff98a92e65cd984778485d6a5 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/wandb-metadata.json @@ -0,0 +1,100 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-28T13:10:12.808868Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage1_ckpt", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "64", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger", + "--strategy", + "ddp" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_ckpt/", + "host": "dsw-265304-558499d597-hhhs7", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + 
"gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "11399299072" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..7be8ff37bece497a2f489dcf2e126a14d7e58480 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-06-28T21:10:12.812155856+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-core.log"} +{"time":"2025-06-28T21:10:14.053332929+08:00","level":"INFO","msg":"created new stream","id":"qflz8r5n"} +{"time":"2025-06-28T21:10:14.053381793+08:00","level":"INFO","msg":"stream: started","id":"qflz8r5n"} +{"time":"2025-06-28T21:10:14.053445291+08:00","level":"INFO","msg":"sender: started","stream_id":"qflz8r5n"} +{"time":"2025-06-28T21:10:14.053438231+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"qflz8r5n"} +{"time":"2025-06-28T21:10:14.053511614+08:00","level":"INFO","msg":"handler: started","stream_id":"qflz8r5n"} +{"time":"2025-06-28T21:10:15.239272048+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..028766eb57f0ce63e5aa7b5f439f95baae72f7fd --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log @@ -0,0 +1,22 @@ +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Configure stats pid to 407544 +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:setup_run_log_directory():724] Logging user logs to 
./all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():852] calling init triggers +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():893] starting backend +2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():897] sending inform_init request +2025-06-28 21:10:12,804 INFO MainThread:407544 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-28 21:10:12,806 INFO MainThread:407544 [wandb_init.py:init():907] backend started and connected +2025-06-28 21:10:12,809 INFO MainThread:407544 [wandb_init.py:init():1005] updated telemetry +2025-06-28 21:10:12,813 INFO MainThread:407544 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-28 21:10:15,229 INFO MainThread:407544 [wandb_init.py:init():1104] starting run threads in backend +2025-06-28 21:10:15,370 INFO MainThread:407544 [wandb_run.py:_console_start():2573] atexit reg +2025-06-28 21:10:15,370 INFO MainThread:407544 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-28 21:10:15,373 INFO MainThread:407544 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-28 21:10:15,373 INFO MainThread:407544 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-28 21:10:15,375 INFO MainThread:407544 [wandb_init.py:init():1150] run started, returning control to user process diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/run-qflz8r5n.wandb b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/run-qflz8r5n.wandb new file mode 100644 index 0000000000000000000000000000000000000000..14a669a739b959b4773ac5d41fc538101cffc701 Binary files /dev/null and b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/run-qflz8r5n.wandb differ diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ebf36c95e4de8d5d51433d81720389524f20f962 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log @@ -0,0 +1,5 @@ +{"time":"2025-06-28T22:19:45.153786975+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-core.log"} +{"time":"2025-06-28T22:20:15.259922664+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-28T22:20:47.297526211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-28T22:21:21.358317867+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": 
net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-28T22:22:00.174480175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"} diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..94bb96a89336d9032e780c5a2c33d4cb85d8f64d --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log @@ -0,0 +1,94 @@ +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Configure stats pid to 2313 +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():852] calling init triggers +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():893] starting backend +2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():897] sending inform_init request +2025-06-28 22:19:45,145 INFO MainThread:2313 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-28 22:19:45,146 INFO MainThread:2313 [wandb_init.py:init():907] backend started and connected +2025-06-28 22:19:45,153 INFO MainThread:2313 [wandb_init.py:init():1005] updated telemetry +2025-06-28 22:19:45,155 INFO MainThread:2313 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-28 22:20:16,340 INFO Thread-3 (wrapped_target):2313 [retry.py:__call__():175] [no run ID] Retry attempt failed: +Traceback (most recent call last): + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn + sock = connection.create_connection( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection + raise err + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection + sock.connect(sa) +TimeoutError: timed out + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen + response = self._make_request( + File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request + raise new_e + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request + self._validate_conn(conn) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn + conn.connect() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect + self.sock = sock = self._new_conn() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn + raise ConnectTimeoutError( +urllib3.exceptions.ConnectTimeoutError: (, 'Connection to api.wandb.ai timed out. (connect timeout=20)') + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send + resp = conn.urlopen( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen + retries = retries.increment( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment + raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type] +urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(, 'Connection to api.wandb.ai timed out. (connect timeout=20)')) + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__ + result = self._call_fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute + return self.client.execute(*args, **kwargs) # type: ignore + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute + result = self._get_result(document, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result + return self.transport.execute(document, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute + request = self.session.post(self.url, **post_args) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post + return self.request("POST", url, data=data, json=json, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request + resp = self.send(prep, **send_kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send + r = adapter.send(request, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send + raise ConnectTimeout(e, request=request) +requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(, 'Connection to api.wandb.ai timed out. 
(connect timeout=20)')) +2025-06-28 22:20:20,942 WARNING MainThread:2313 [wandb_init.py:init():1681] [no run ID] interrupted +Traceback (most recent call last): + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init + return wi.init(run_settings, run_config, run_printer) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init + result = wait_with_progress( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress + return wait_all_with_progress( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress + return asyncio_compat.run(progress_loop_with_timeout) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run + return future.result() + File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result + self._condition.wait(timeout) + File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait + waiter.acquire() +KeyboardInterrupt +2025-06-28 22:20:21,092 INFO MsgRouterThr:2313 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles. diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/config.yaml b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21d7660a5fab99d9e9456df6dedbddc3afd9982d --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/config.yaml @@ -0,0 +1,429 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": onto_test_fullset_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": trainer/global_step + "6": + - 3 + "7": [] + - "1": loader2/val_loss_lm/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": lr + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_fullset_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_inbatch_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_inbatch_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_inbatch_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_inbatch_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss_ptc/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_lm/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss_lm/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_fullset_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_fullset_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_inbatch_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_fullset_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_inbatch_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_fullset_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_inbatch_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss_ptc/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": 
swiss_test_rerank_inbatch_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_fullset_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_ptm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_inbatch_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_inbatch_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_fullset_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_ptm/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_ptc/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_ptc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_fullset_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_fullset_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_fullset_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_inbatch_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_inbatch_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss_ptm/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_lm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_rerank_fullset_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_fullset_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_inbatch_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_rerank_inbatch_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss_ptm/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_inbatch_t2p_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_fullset_p2t_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_fullset_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": swiss_test_fullset_p2t_rec20 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": onto_test_inbatch_t2p_acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 33 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 96 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +filename: + value: stage1_ckpt +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data_small +save_every_n_epochs: + value: 5 +scheduler: + 
value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: ddp +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/output.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1d146d64e30941dbeaf769f726d1c54131660c37 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/output.log @@ -0,0 +1,2 @@ + +Detected KeyboardInterrupt, attempting graceful shutdown ... diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/requirements.txt b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..319175f59dd507b489b3e7fc55c387d8ea6e4728 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/requirements.txt @@ -0,0 +1,225 @@ +stack-data==0.6.3 +yarl==1.20.0 +setuptools==78.1.1 +cloudpathlib==0.21.1 +pytz==2025.2 +nvidia-cufile-cu12==1.11.1.6 +shellingham==1.5.4 +nvidia-cusolver-cu12==11.6.1.9 +Jinja2==3.1.6 +pycocotools==2.0.8 +pandas==2.2.3 +scipy==1.15.3 +tenacity==9.1.2 +lightning-utilities==0.14.3 +cfgv==3.4.0 +hf-xet==1.1.2 +platformdirs==4.3.8 +smart-open==7.1.0 +text-unidecode==1.3 +nvidia-cublas-cu12==12.4.5.8 +omegaconf==2.3.0 +referencing==0.36.2 +mdurl==0.1.2 +gitdb==4.0.12 +identify==2.6.12 +ipython==8.36.0 +spacy-loggers==1.0.5 +distlib==0.3.9 +typing-inspection==0.4.1 +antlr4-python3-runtime==4.9.3 +multidict==6.4.4 +nvidia-curand-cu12==10.3.5.147 +prompt_toolkit==3.0.51 +Pygments==2.19.1 +numpy==2.2.6 +decord==0.6.0 +srsly==2.5.1 +watchdog==6.0.0 +pure_eval==0.2.3 +virtualenv==20.31.2 +altair==5.5.0 +matplotlib-inline==0.1.7 +bleach==6.2.0 +exceptiongroup==1.3.0 +fairscale==0.4.4 +confection==0.1.5 +fonttools==4.58.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +ptyprocess==0.7.0 +pytorch-lightning==2.5.1.post0 +nodeenv==1.9.1 +nvidia-cudnn-cu12==9.1.0.70 +requests==2.32.3 +marisa-trie==1.2.1 +cachetools==5.5.2 +matplotlib==3.10.3 +typing_extensions==4.13.2 +asttokens==3.0.0 +torch==2.6.0 +PyYAML==6.0.2 +tifffile==2025.5.10 +spacy==3.8.7 +braceexpand==0.1.7 +plotly==6.1.1 +attrs==25.3.0 +py-cpuinfo==9.0.0 +frozenlist==1.6.0 +catalogue==2.0.10 +nvidia-cusparselt-cu12==0.6.2 +traitlets==5.14.3 +annotated-types==0.7.0 +language_data==1.3.0 +thinc==8.3.6 +imageio==2.37.0 +nvidia-cuda-runtime-cu12==12.4.127 +certifi==2025.4.26 +smmap==5.0.2 +python-magic==0.4.27 +triton==3.2.0 +weasel==0.4.1 +async-timeout==5.0.1 +wcwidth==0.2.13 +pillow==11.2.1 +torchmetrics==1.7.1 +kaggle==1.7.4.5 +regex==2024.11.6 +aiosignal==1.3.2 +nvidia-cusparse-cu12==12.3.1.170 +scikit-image==0.25.2 +nvidia-nvtx-cu12==12.4.127 +opendatasets==0.1.22 +iopath==0.1.10 +pyparsing==3.2.3 +portalocker==3.1.1 +executing==2.2.0 +contexttimer==0.3.3 +lazy_loader==0.4 +wrapt==1.17.2 +webdataset==0.2.111 +blis==1.3.0 +idna==3.10 +timm==0.4.12 +einops==0.8.1 +packaging==24.2 +decorator==5.2.1 +filelock==3.18.0 +python-slugify==8.0.4 +cycler==0.12.1 +charset-normalizer==3.4.2 +pydantic==2.11.5 +pydeck==0.9.1 +tzdata==2025.2 +jedi==0.19.2 +aiohappyeyeballs==2.6.1 +nvidia-nvjitlink-cu12==12.4.127 +salesforce-lavis==1.0.2 +parso==0.8.4 +nvidia-nccl-cu12==2.21.5 +toml==0.10.2 
+python-dateutil==2.9.0.post0 +rich==14.0.0 +tqdm==4.67.1 +rpds-py==0.25.1 +opencv-python-headless==4.5.5.64 +tornado==6.5.1 +propcache==0.3.1 +webencodings==0.5.1 +murmurhash==1.0.13 +contourpy==1.3.2 +joblib==1.5.1 +networkx==3.4.2 +six==1.17.0 +markdown-it-py==3.0.0 +nvidia-cuda-cupti-cu12==12.4.127 +msgpack==1.1.0 +sentencepiece==0.2.0 +cymem==2.0.11 +nvidia-cufft-cu12==11.2.1.3 +absl-py==2.2.2 +hjson==3.1.0 +mpmath==1.3.0 +pydantic_core==2.33.2 +psutil==7.0.0 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +kiwisolver==1.4.8 +sympy==1.13.1 +ninja==1.11.1.4 +rouge_score==0.1.2 +deepspeed==0.16.10+b666844f +spacy-legacy==3.0.12 +pycocoevalcap==1.2 +pexpect==4.9.0 +ftfy==6.3.1 +protobuf==6.31.0 +urllib3==2.4.0 +wheel==0.45.1 +nltk==3.9.1 +streamlit==1.45.1 +wasabi==1.1.3 +pre_commit==4.2.0 +safetensors==0.5.3 +jsonschema-specifications==2025.4.1 +langcodes==3.5.0 +GitPython==3.1.44 +blinker==1.9.0 +torchvision==0.21.0 +MarkupSafe==3.0.2 +dill==0.3.8 +yacs==0.1.8 +pathlib==1.0.1 +scikit-learn==1.6.1 +cffi==1.17.1 +pycparser==2.22 +flash-attn==2.7.1.post1 +cryptography==45.0.3 +pycryptodome==3.23.0 +cheroot==10.0.1 +more-itertools==10.7.0 +setproctitle==1.3.6 +delta-center-client==0.0.4 +jmespath==0.10.0 +xxhash==3.5.0 +pip==25.1.1 +aliyun-python-sdk-core==2.16.0 +jaraco.functools==4.1.0 +bigmodelvis==0.0.1 +aiohttp==3.12.2 +multiprocess==0.70.16 +opendelta==0.3.2 +docker-pycreds==0.4.0 +threadpoolctl==3.6.0 +click==8.2.1 +oss2==2.15.0 +crcmod==1.7 +transformers==4.52.3 +datasets==3.6.0 +jsonschema==4.24.0 +opencv-python==4.11.0.86 +wandb==0.19.11 +fsspec==2025.3.0 +tokenizers==0.21.1 +sentry-sdk==2.29.1 +preshed==3.0.10 +aliyun-python-sdk-kms==2.16.5 +huggingface-hub==0.32.1 +typer==0.16.0 +narwhals==1.41.0 +web.py==0.62 +autocommand==2.2.2 +importlib_metadata==8.0.0 +zipp==3.19.2 +jaraco.context==5.3.0 +typeguard==4.3.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +backports.tarfile==1.2.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +platformdirs==4.2.2 +packaging==24.2 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.45.1 +inflect==7.3.1 diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6e814d9d0c7d6fdbca6558c471a9ef16f86deb01 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-metadata.json @@ -0,0 +1,100 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-28T14:23:55.239654Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage1_ckpt", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "96", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger", + "--strategy", + "ddp" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_ckpt/", + "host": "dsw-265304-57b7b77cbc-vwbwc", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA 
A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1285099520" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6999c7511a71bdd4a2b5cd198338ce3d88b4989f --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-summary.json @@ -0,0 +1 @@ +{"swiss_test_inbatch_t2p_acc":30,"train_loss_ptc":5.0676679611206055,"_step":14,"onto_test_rerank_fullset_t2p_rec20":49.5,"train_loss_ptm":0.6344426870346069,"swiss_test_rerank_fullset_t2p_acc":15,"onto_test_fullset_p2t_rec20":56,"loader1/val_loss/dataloader_idx_1":9.734654426574707,"swiss_test_rerank_fullset_t2p_rec20":85,"swiss_test_inbatch_p2t_acc":25,"train_loss":9.90674877166748,"onto_test_rerank_inbatch_t2p_acc":10.5,"onto_test_inbatch_t2p_acc":11,"onto_test_rerank_inbatch_p2t_rec20":83,"onto_test_rerank_inbatch_t2p_rec20":82.5,"swiss_test_rerank_inbatch_t2p_acc":22,"swiss_test_rerank_inbatch_t2p_rec20":93,"epoch":13,"train_loss_lm":4.204638481140137,"loader1/val_loss_ptm/dataloader_idx_1":0.6397265791893005,"loader2/val_loss/dataloader_idx_2":10.584230422973633,"loader1/val_loss_ptc/dataloader_idx_1":4.00998067855835,"swiss_test_rerank_inbatch_p2t_acc":16,"onto_test_rerank_fullset_p2t_rec20":41.5,"onto_test_fullset_t2p_rec20":50.5,"_wandb":{"runtime":4652},"loader0/val_loss/dataloader_idx_0":7.924057483673096,"onto_test_rerank_fullset_t2p_acc":5,"onto_test_rerank_fullset_p2t_acc":3,"loader0/val_loss_ptc/dataloader_idx_0":2.931434154510498,"swiss_test_fullset_t2p_rec20":85,"onto_test_fullset_p2t_acc":11,"_runtime":4531.685308266,"_timestamp":1.751125166924585e+09,"loader2/val_loss_ptc/dataloader_idx_2":4.096363544464111,"loader1/val_loss_lm/dataloader_idx_1":5.084947109222412,"onto_test_inbatch_p2t_acc":19,"loader0/val_loss_ptm/dataloader_idx_0":0.6371256709098816,"onto_test_rerank_inbatch_p2t_acc":9.5,"loader0/val_loss_lm/dataloader_idx_0":4.355497360229492,"swiss_test_fullset_t2p_acc":18,"swiss_test_inbatch_t2p_rec20":93,"loader2/val_loss_ptm/dataloader_idx_2":0.641204297542572,"swiss_test_fullset_p2t_acc":21,"onto_test_fullset_t2p_acc":4.5,"swiss_test_fullset_p2t_rec20":85,"swiss_test_rerank_fullset_p2t_rec20":73,"swiss_test_rerank_fullset_p2t_acc":7,"lr":6.203955126693472e-05,"onto_test_inbatch_t2p_rec20":83,"swiss_test_rerank_inbatch_p2t_rec20":92,"lo
ader2/val_loss_lm/dataloader_idx_2":5.846662521362305,"swiss_test_inbatch_p2t_rec20":95,"onto_test_inbatch_p2t_rec20":91,"trainer/global_step":69} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..da5b425fc97aad42df6846dde7841ee1fe1626a1 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-06-28T22:23:55.245053432+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-core.log"} +{"time":"2025-06-28T22:23:56.442748363+08:00","level":"INFO","msg":"created new stream","id":"e9wtzwz1"} +{"time":"2025-06-28T22:23:56.442786823+08:00","level":"INFO","msg":"stream: started","id":"e9wtzwz1"} +{"time":"2025-06-28T22:23:56.44283909+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"e9wtzwz1"} +{"time":"2025-06-28T22:23:56.44287811+08:00","level":"INFO","msg":"sender: started","stream_id":"e9wtzwz1"} +{"time":"2025-06-28T22:23:56.442850569+08:00","level":"INFO","msg":"handler: started","stream_id":"e9wtzwz1"} +{"time":"2025-06-28T22:23:57.657067842+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-28T23:13:08.786733475+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:41088->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-28T23:16:23.858735046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:58168->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-28T23:20:12.333412842+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:50886->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-28T23:28:29.895934993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": unexpected EOF"} +{"time":"2025-06-28T23:32:39.731699923+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:54496->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-28T23:35:17.938724051+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:33406->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-28T23:38:54.515701632+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:59930->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-28T23:41:28.010949965+08:00","level":"INFO","msg":"stream: closing","id":"e9wtzwz1"} +{"time":"2025-06-28T23:41:28.011132748+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-28T23:41:28.066664522+08:00","level":"INFO","msg":"Stopped system monitor"} 
+{"time":"2025-06-28T23:41:37.996261564+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-28T23:42:36.21077519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:36806->172.67.193.61:443: read: connection timed out"} diff --git a/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..07dbd5943b12ee0e0392651f2302bf189a4474ef --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log @@ -0,0 +1,23 @@ +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Configure stats pid to 3589 +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():852] calling init triggers +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():893] starting backend +2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():897] sending inform_init request +2025-06-28 22:23:55,237 INFO MainThread:3589 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-28 22:23:55,239 INFO MainThread:3589 [wandb_init.py:init():907] backend started and connected +2025-06-28 22:23:55,240 INFO MainThread:3589 [wandb_init.py:init():1005] updated telemetry +2025-06-28 22:23:55,249 INFO MainThread:3589 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-28 22:23:57,648 INFO MainThread:3589 [wandb_init.py:init():1104] starting run threads in backend +2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_console_start():2573] atexit reg +2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-28 22:23:57,829 INFO MainThread:3589 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-28 23:41:27,993 INFO MsgRouterThr:3589 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles. 
diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2a1f32d5d095c14c0ddc38b5326bc5f50c301ff1 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/files/wandb-metadata.json @@ -0,0 +1,107 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-07T11:01:45.766473Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage2.5_mol_instruction", + "--num_query_token", + "8", + "--save_every_n_epochs", + "1", + "--max_epochs", + "10", + "--batch_size", + "1", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt", + "--use_wandb_logger", + "--text_max_len", + "1024", + "--prot_max_len", + "2048" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": "stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2.5_mol_instruction/", + "host": "dsw-265304-7f6db6b4bb-g4b9r", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1260933120" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..dc81cd83b559048c7268fa8e1be689e3a1bdcad3 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-07-07T19:01:45.769106543+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink 
path":"all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-core.log"} +{"time":"2025-07-07T19:01:46.836668426+08:00","level":"INFO","msg":"created new stream","id":"vu5mgolt"} +{"time":"2025-07-07T19:01:46.836704412+08:00","level":"INFO","msg":"stream: started","id":"vu5mgolt"} +{"time":"2025-07-07T19:01:46.836753246+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vu5mgolt"} +{"time":"2025-07-07T19:01:46.836778842+08:00","level":"INFO","msg":"handler: started","stream_id":"vu5mgolt"} +{"time":"2025-07-07T19:01:46.836807537+08:00","level":"INFO","msg":"sender: started","stream_id":"vu5mgolt"} +{"time":"2025-07-07T19:01:48.143178072+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..49a16cb0f4274c52105b37126aa9bbb7055b8969 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log @@ -0,0 +1,22 @@ +2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Configure stats pid to 121602 +2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():852] calling init triggers +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():893] starting backend +2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():897] sending inform_init request +2025-07-07 19:01:45,760 INFO MainThread:121602 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-07 19:01:45,762 INFO MainThread:121602 [wandb_init.py:init():907] backend started and connected +2025-07-07 19:01:45,767 INFO MainThread:121602 [wandb_init.py:init():1005] updated telemetry +2025-07-07 19:01:45,771 INFO MainThread:121602 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 19:01:48,130 INFO MainThread:121602 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 19:01:48,276 INFO MainThread:121602 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 19:01:48,276 INFO MainThread:121602 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 19:01:48,292 INFO MainThread:121602 [wandb_run.py:_redirect():2490] Wrapping 
output streams. +2025-07-07 19:01:48,292 INFO MainThread:121602 [wandb_run.py:_redirect():2513] Redirects installed. +2025-07-07 19:01:48,293 INFO MainThread:121602 [wandb_init.py:init():1150] run started, returning control to user process diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/output.log b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1c83f83a2ed65ad2b0601745793b6fa79a30f954 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/output.log @@ -0,0 +1,4 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2.5_mol_instruction exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +[rank: 5] Child process with PID 126090 terminated with code 1. Forcefully terminating all other processes to avoid zombies 🧟 diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/requirements.txt b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0319522eabea736b5ac8dd313c08e1dda7da1ca --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/requirements.txt @@ -0,0 +1,225 @@ +pydantic_core==2.33.2 +psutil==7.0.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +mpmath==1.3.0 +tzdata==2025.2 +contexttimer==0.3.3 +cycler==0.12.1 +python-magic==0.4.27 +pexpect==4.9.0 +sympy==1.13.1 +wrapt==1.17.2 +marisa-trie==1.2.1 +langcodes==3.5.0 +nvidia-nvtx-cu12==12.4.127 +ipython==8.36.0 +opencv-python-headless==4.5.5.64 +MarkupSafe==3.0.2 +jsonschema-specifications==2025.4.1 +wasabi==1.1.3 +blinker==1.9.0 +cfgv==3.4.0 +numpy==2.2.6 +idna==3.10 +nvidia-cufile-cu12==1.11.1.6 +ninja==1.11.1.4 +nvidia-nccl-cu12==2.21.5 +networkx==3.4.2 +certifi==2025.4.26 +deepspeed==0.16.10+b666844f +pure_eval==0.2.3 +packaging==24.2 +nltk==3.9.1 +contourpy==1.3.2 +pre_commit==4.2.0 +nodeenv==1.9.1 +setuptools==78.1.1 +annotated-types==0.7.0 +multidict==6.4.4 +requests==2.32.3 +tornado==6.5.1 +triton==3.2.0 +pillow==11.2.1 +decord==0.6.0 +shellingham==1.5.4 +streamlit==1.45.1 +pydeck==0.9.1 +confection==0.1.5 +exceptiongroup==1.3.0 +prompt_toolkit==3.0.51 +text-unidecode==1.3 +nvidia-cufft-cu12==11.2.1.3 +antlr4-python3-runtime==4.9.3 +fairscale==0.4.4 +rouge_score==0.1.2 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +rich==14.0.0 +frozenlist==1.6.0 +webencodings==0.5.1 +altair==5.5.0 +opendatasets==0.1.22 +nvidia-curand-cu12==10.3.5.147 +protobuf==6.31.0 +asttokens==3.0.0 +wheel==0.45.1 +hf-xet==1.1.2 +weasel==0.4.1 +aiosignal==1.3.2 +absl-py==2.2.2 +thinc==8.3.6 +torchvision==0.21.0 +pandas==2.2.3 +fonttools==4.58.0 +bleach==6.2.0 +typing-inspection==0.4.1 +ftfy==6.3.1 +typing_extensions==4.13.2 +nvidia-ml-py==12.575.51 +python-slugify==8.0.4 +lightning-utilities==0.14.3 +py-cpuinfo==9.0.0 +smmap==5.0.2 +regex==2024.11.6 +scikit-image==0.25.2 +iopath==0.1.10 +spacy-legacy==3.0.12 +hjson==3.1.0 +executing==2.2.0 +kiwisolver==1.4.8 +scipy==1.15.3 +aiohappyeyeballs==2.6.1 +toml==0.10.2 +jedi==0.19.2 +GitPython==3.1.44 +ptyprocess==0.7.0 +kaggle==1.7.4.5 
+braceexpand==0.1.7 +wcwidth==0.2.13 +nvidia-cuda-runtime-cu12==12.4.127 +pytorch-lightning==2.5.1.post0 +Jinja2==3.1.6 +urllib3==2.4.0 +watchdog==6.0.0 +filelock==3.18.0 +propcache==0.3.1 +torch==2.6.0 +nvidia-cusparse-cu12==12.3.1.170 +cymem==2.0.11 +nvidia-cusolver-cu12==11.6.1.9 +murmurhash==1.0.13 +catalogue==2.0.10 +yarl==1.20.0 +charset-normalizer==3.4.2 +gitdb==4.0.12 +matplotlib==3.10.3 +portalocker==3.1.1 +platformdirs==4.3.8 +async-timeout==5.0.1 +parso==0.8.4 +markdown-it-py==3.0.0 +omegaconf==2.3.0 +cloudpathlib==0.21.1 +nvidia-cusparselt-cu12==0.6.2 +spacy-loggers==1.0.5 +srsly==2.5.1 +identify==2.6.12 +rpds-py==0.25.1 +spacy==3.8.7 +matplotlib-inline==0.1.7 +smart-open==7.1.0 +pydantic==2.11.5 +mdurl==0.1.2 +virtualenv==20.31.2 +pytz==2025.2 +pycocotools==2.0.8 +six==1.17.0 +decorator==5.2.1 +referencing==0.36.2 +sentencepiece==0.2.0 +PyYAML==6.0.2 +pycocoevalcap==1.2 +imageio==2.37.0 +distlib==0.3.9 +pyarrow==20.0.0 +tenacity==9.1.2 +language_data==1.3.0 +nvidia-cuda-cupti-cu12==12.4.127 +blis==1.3.0 +Pygments==2.19.1 +tifffile==2025.5.10 +pyparsing==3.2.3 +cachetools==5.5.2 +safetensors==0.5.3 +attrs==25.3.0 +webdataset==0.2.111 +plotly==6.1.1 +nvidia-cublas-cu12==12.4.5.8 +timm==0.4.12 +torchmetrics==1.7.1 +nvidia-nvjitlink-cu12==12.4.127 +stack-data==0.6.3 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +traitlets==5.14.3 +einops==0.8.1 +salesforce-lavis==1.0.2 +joblib==1.5.1 +msgpack==1.1.0 +tokenizers==0.21.1 +sentry-sdk==2.29.1 +oss2==2.15.0 +setproctitle==1.3.6 +pip==25.1.1 +cffi==1.17.1 +transformers==4.52.3 +narwhals==1.41.0 +aliyun-python-sdk-core==2.16.0 +jsonschema==4.24.0 +flash-attn==2.7.1.post1 +preshed==3.0.10 +multiprocess==0.70.16 +cryptography==45.0.3 +aliyun-python-sdk-kms==2.16.5 +scikit-learn==1.6.1 +huggingface-hub==0.32.1 +crcmod==1.7 +typer==0.16.0 +web.py==0.62 +docker-pycreds==0.4.0 +xxhash==3.5.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +more-itertools==10.7.0 +yacs==0.1.8 +jmespath==0.10.0 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycparser==2.22 +threadpoolctl==3.6.0 +jaraco.functools==4.1.0 +click==8.2.1 +wandb==0.19.11 +opendelta==0.3.2 +pycryptodome==3.23.0 +pathlib==1.0.1 +dill==0.3.8 +fsspec==2025.3.0 +delta-center-client==0.0.4 +cheroot==10.0.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.text==3.12.1 +packaging==24.2 +inflect==7.3.1 +jaraco.context==5.3.0 +wheel==0.45.1 +typeguard==4.3.0 +more-itertools==10.3.0 +tomli==2.0.1 +importlib_metadata==8.0.0 +backports.tarfile==1.2.0 +zipp==3.19.2 +jaraco.collections==5.1.0 +autocommand==2.2.2 +jaraco.functools==4.0.1 diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..161a774026b5648804941619b4dcbbdfc9f55c05 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/wandb-metadata.json @@ -0,0 +1,107 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-07T11:53:53.846220Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage2.5_mol_instruction", + "--num_query_token", + "8", + "--save_every_n_epochs", + "1", + "--max_epochs", + "10", + "--batch_size", + "1", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + 
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt", + "--use_wandb_logger", + "--text_max_len", + "1024", + "--prot_max_len", + "2048" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": "stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2.5_mol_instruction/", + "host": "dsw-265304-7f6db6b4bb-g4b9r", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1260957696" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..366d066fb1487c2d933fc4b190857bd0c8372e42 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-07-07T19:53:53.851667884+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-core.log"} +{"time":"2025-07-07T19:53:55.917977588+08:00","level":"INFO","msg":"created new stream","id":"qhvlkre6"} +{"time":"2025-07-07T19:53:55.918022111+08:00","level":"INFO","msg":"stream: started","id":"qhvlkre6"} +{"time":"2025-07-07T19:53:55.918057138+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"qhvlkre6"} +{"time":"2025-07-07T19:53:55.918080879+08:00","level":"INFO","msg":"handler: started","stream_id":"qhvlkre6"} +{"time":"2025-07-07T19:53:55.918147188+08:00","level":"INFO","msg":"sender: started","stream_id":"qhvlkre6"} +{"time":"2025-07-07T19:53:57.234333371+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log new 
file mode 100644 index 0000000000000000000000000000000000000000..6c50d9abda8eb80845104d42e201892f11c8fed4 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log @@ -0,0 +1,22 @@ +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Configure stats pid to 125661 +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():852] calling init triggers +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():893] starting backend +2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():897] sending inform_init request +2025-07-07 19:53:53,842 INFO MainThread:125661 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-07 19:53:53,846 INFO MainThread:125661 [wandb_init.py:init():907] backend started and connected +2025-07-07 19:53:53,847 INFO MainThread:125661 [wandb_init.py:init():1005] updated telemetry +2025-07-07 19:53:53,852 INFO MainThread:125661 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 19:53:57,184 INFO MainThread:125661 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 19:53:57,375 INFO MainThread:125661 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 19:53:57,375 INFO MainThread:125661 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 19:53:57,379 INFO MainThread:125661 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-07 19:53:57,379 INFO MainThread:125661 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-07-07 19:53:57,380 INFO MainThread:125661 [wandb_init.py:init():1150] run started, returning control to user process diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/run-qhvlkre6.wandb b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/run-qhvlkre6.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_200229-yex1pcwt/files/config.yaml b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_200229-yex1pcwt/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c2260ab7b10ac5ad8b704628897a93f776935ad --- /dev/null +++ b/ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_200229-yex1pcwt/files/config.yaml @@ -0,0 +1,216 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": trainer/global_step + "6": + - 3 + "7": [] + - "1": dataset0/rouge_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu4 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataloader0/val loss/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_l + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/meteor_score + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": lr + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +a_max_len: + value: 36 +accelerator: + value: gpu +accumulate_grad_batches: + value: 1 +batch_size: + value: 2 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +caption_eval_epoch: + value: 10 +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +do_sample: + value: false +enable_flash: + value: false +enbale_gradient_checkpointing: + value: false +filename: + value: stage2.5_mol_instruction +filter_side_qa: + value: false +inference_batch_size: + value: 4 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +llm_name: + value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300 +llm_tune: + value: mid_lora +lora_alpha: + value: 16 +lora_dropout: + value: 0.1 +lora_r: + value: 8 +lr_decay_rate: + value: 0.9 +max_epochs: + value: 10 +max_inference_len: + value: 128 +min_inference_len: + value: 1 +min_lr: + value: 1e-05 +mix_dataset: + value: false +mode: + value: train +num_beams: + value: 5 +num_query_token: + value: 8 +num_workers: + value: 8 +peft_config: + value: "" +peft_dir: + value: "" +plm_model: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +precision: + value: bf16-mixed +prompt: + value: 'The protein has the following properties: ' +prot_max_len: + value: 1024 +q_max_len: + value: 29 +root: + value: data +save_every_n_epochs: + value: 1 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +stage1_path: + value: 
/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt +stage2_path: + value: "" +strategy: + value: deepspeed +text_max_len: + value: 1024 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3b7d7765f1ce10ed6cb04436fb518db1b79b7c0a --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log @@ -0,0 +1,24 @@ +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Configure stats pid to 50671 +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug-internal.log +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():852] calling init triggers +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():893] starting backend +2025-07-04 15:32:50,808 INFO MainThread:50671 [wandb_init.py:init():897] sending inform_init request +2025-07-04 15:32:50,809 INFO MainThread:50671 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-04 15:32:50,809 INFO MainThread:50671 [wandb_init.py:init():907] backend started and connected +2025-07-04 15:32:50,810 INFO MainThread:50671 [wandb_init.py:init():1005] updated telemetry +2025-07-04 15:32:50,811 INFO MainThread:50671 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-04 15:32:53,521 INFO MainThread:50671 [wandb_init.py:init():1104] starting run threads in backend +2025-07-04 15:32:53,737 INFO MainThread:50671 [wandb_run.py:_console_start():2573] atexit reg +2025-07-04 15:32:53,737 INFO MainThread:50671 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-04 15:32:53,744 INFO MainThread:50671 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-04 15:32:53,744 INFO MainThread:50671 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-07-04 15:32:53,764 INFO MainThread:50671 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-04 15:33:02,588 INFO MainThread:50671 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07041521', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 15, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 128, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-04 15:33:07,743 INFO MsgRouterThr:50671 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/config.yaml b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfc80161b4229bbcf1b793d9e03d71af6de4b039 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/config.yaml @@ -0,0 +1,222 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": dataloader2/val loss/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": trainer/global_step + "6": + - 3 + "7": [] + - "1": dataloader0/val loss/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/meteor_score + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_l + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": lr + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu4 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +a_max_len: + value: 36 +accelerator: + value: gpu +accumulate_grad_batches: + value: 1 +batch_size: + value: 32 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +caption_eval_epoch: + value: 10 +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +do_sample: + value: false +enable_flash: + value: false +enbale_gradient_checkpointing: + value: false +filename: + value: stage2_07041521 +filter_side_qa: + value: false +inference_batch_size: + value: 4 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +llm_name: + value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300 +llm_tune: + value: mid_lora +lora_alpha: + value: 16 +lora_dropout: + value: 0.1 +lora_r: + value: 8 +lr_decay_rate: + value: 0.9 +max_epochs: + value: 15 +max_inference_len: + value: 128 +min_inference_len: + value: 1 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_beams: + value: 5 +num_query_token: + value: 8 +num_workers: + value: 8 +peft_config: + value: "" +peft_dir: + value: "" +plm_model: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +precision: + value: bf16-mixed +prompt: + value: 'The protein has the following properties: ' +prot_max_len: + value: 1024 +q_max_len: + value: 29 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +stage1_path: + value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt +stage2_path: + value: "" +strategy: + value: deepspeed +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git 
a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/output.log b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..bab96f25fa5652f2efaf1fc699fe236e244f49c3 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/output.log @@ -0,0 +1,35 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07041521 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------- +0 | blip2 | Blip2OPT | 7.9 B | train +------------------------------------------- +104 M Trainable params +7.8 B Non-trainable params +7.9 B Total params +31,459.025Total estimated model params size (MB) +174 Modules in train mode +1203 Modules in eval mode +Epoch 9: 100%|██████████████████████████████████████████| 1682/1682 [34:15<00:00, 0.82it/s, v_num=rt6r]BLEU-2 score: 14.521351656885983 +BLEU-4 score: 12.023162430085268██████████████████████████████████████| 313/313 [20:52<00:00, 0.25it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + sd = self.module.state_dict(destination, prefix, keep_vars) +20000it [01:44, 191.35it/s] +20000it [00:23, 848.56it/s] +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +Average Meteor score: 20.348670863196457 +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +ROUGE score: +rouge1: 20.24471429685868 +rouge2: 13.338771592014881 +rougeL: 18.14378222211823 +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu4', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_1', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. 
+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_l', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/meteor_score', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +Epoch 14: 100%|█████████████████████████████████████████| 1682/1682 [35:23<00:00, 0.79it/s, v_num=rt6r] + +`Trainer.fit` stopped: `max_epochs=15` reached. diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/requirements.txt b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0319522eabea736b5ac8dd313c08e1dda7da1ca --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/requirements.txt @@ -0,0 +1,225 @@ +pydantic_core==2.33.2 +psutil==7.0.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +mpmath==1.3.0 +tzdata==2025.2 +contexttimer==0.3.3 +cycler==0.12.1 +python-magic==0.4.27 +pexpect==4.9.0 +sympy==1.13.1 +wrapt==1.17.2 +marisa-trie==1.2.1 +langcodes==3.5.0 +nvidia-nvtx-cu12==12.4.127 +ipython==8.36.0 +opencv-python-headless==4.5.5.64 +MarkupSafe==3.0.2 +jsonschema-specifications==2025.4.1 +wasabi==1.1.3 +blinker==1.9.0 +cfgv==3.4.0 +numpy==2.2.6 +idna==3.10 +nvidia-cufile-cu12==1.11.1.6 +ninja==1.11.1.4 +nvidia-nccl-cu12==2.21.5 +networkx==3.4.2 +certifi==2025.4.26 +deepspeed==0.16.10+b666844f +pure_eval==0.2.3 +packaging==24.2 +nltk==3.9.1 +contourpy==1.3.2 +pre_commit==4.2.0 +nodeenv==1.9.1 +setuptools==78.1.1 +annotated-types==0.7.0 +multidict==6.4.4 +requests==2.32.3 +tornado==6.5.1 +triton==3.2.0 +pillow==11.2.1 +decord==0.6.0 +shellingham==1.5.4 +streamlit==1.45.1 +pydeck==0.9.1 +confection==0.1.5 +exceptiongroup==1.3.0 +prompt_toolkit==3.0.51 +text-unidecode==1.3 +nvidia-cufft-cu12==11.2.1.3 +antlr4-python3-runtime==4.9.3 +fairscale==0.4.4 +rouge_score==0.1.2 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +rich==14.0.0 +frozenlist==1.6.0 +webencodings==0.5.1 +altair==5.5.0 +opendatasets==0.1.22 +nvidia-curand-cu12==10.3.5.147 +protobuf==6.31.0 +asttokens==3.0.0 +wheel==0.45.1 +hf-xet==1.1.2 +weasel==0.4.1 +aiosignal==1.3.2 +absl-py==2.2.2 +thinc==8.3.6 +torchvision==0.21.0 +pandas==2.2.3 +fonttools==4.58.0 +bleach==6.2.0 +typing-inspection==0.4.1 +ftfy==6.3.1 +typing_extensions==4.13.2 +nvidia-ml-py==12.575.51 +python-slugify==8.0.4 +lightning-utilities==0.14.3 +py-cpuinfo==9.0.0 +smmap==5.0.2 +regex==2024.11.6 +scikit-image==0.25.2 +iopath==0.1.10 +spacy-legacy==3.0.12 +hjson==3.1.0 +executing==2.2.0 +kiwisolver==1.4.8 +scipy==1.15.3 +aiohappyeyeballs==2.6.1 +toml==0.10.2 +jedi==0.19.2 +GitPython==3.1.44 +ptyprocess==0.7.0 +kaggle==1.7.4.5 +braceexpand==0.1.7 +wcwidth==0.2.13 +nvidia-cuda-runtime-cu12==12.4.127 +pytorch-lightning==2.5.1.post0 +Jinja2==3.1.6 +urllib3==2.4.0 +watchdog==6.0.0 +filelock==3.18.0 +propcache==0.3.1 +torch==2.6.0 +nvidia-cusparse-cu12==12.3.1.170 +cymem==2.0.11 +nvidia-cusolver-cu12==11.6.1.9 +murmurhash==1.0.13 +catalogue==2.0.10 +yarl==1.20.0 +charset-normalizer==3.4.2 +gitdb==4.0.12 +matplotlib==3.10.3 +portalocker==3.1.1 +platformdirs==4.3.8 +async-timeout==5.0.1 +parso==0.8.4 
+markdown-it-py==3.0.0 +omegaconf==2.3.0 +cloudpathlib==0.21.1 +nvidia-cusparselt-cu12==0.6.2 +spacy-loggers==1.0.5 +srsly==2.5.1 +identify==2.6.12 +rpds-py==0.25.1 +spacy==3.8.7 +matplotlib-inline==0.1.7 +smart-open==7.1.0 +pydantic==2.11.5 +mdurl==0.1.2 +virtualenv==20.31.2 +pytz==2025.2 +pycocotools==2.0.8 +six==1.17.0 +decorator==5.2.1 +referencing==0.36.2 +sentencepiece==0.2.0 +PyYAML==6.0.2 +pycocoevalcap==1.2 +imageio==2.37.0 +distlib==0.3.9 +pyarrow==20.0.0 +tenacity==9.1.2 +language_data==1.3.0 +nvidia-cuda-cupti-cu12==12.4.127 +blis==1.3.0 +Pygments==2.19.1 +tifffile==2025.5.10 +pyparsing==3.2.3 +cachetools==5.5.2 +safetensors==0.5.3 +attrs==25.3.0 +webdataset==0.2.111 +plotly==6.1.1 +nvidia-cublas-cu12==12.4.5.8 +timm==0.4.12 +torchmetrics==1.7.1 +nvidia-nvjitlink-cu12==12.4.127 +stack-data==0.6.3 +python-dateutil==2.9.0.post0 +lazy_loader==0.4 +traitlets==5.14.3 +einops==0.8.1 +salesforce-lavis==1.0.2 +joblib==1.5.1 +msgpack==1.1.0 +tokenizers==0.21.1 +sentry-sdk==2.29.1 +oss2==2.15.0 +setproctitle==1.3.6 +pip==25.1.1 +cffi==1.17.1 +transformers==4.52.3 +narwhals==1.41.0 +aliyun-python-sdk-core==2.16.0 +jsonschema==4.24.0 +flash-attn==2.7.1.post1 +preshed==3.0.10 +multiprocess==0.70.16 +cryptography==45.0.3 +aliyun-python-sdk-kms==2.16.5 +scikit-learn==1.6.1 +huggingface-hub==0.32.1 +crcmod==1.7 +typer==0.16.0 +web.py==0.62 +docker-pycreds==0.4.0 +xxhash==3.5.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +more-itertools==10.7.0 +yacs==0.1.8 +jmespath==0.10.0 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycparser==2.22 +threadpoolctl==3.6.0 +jaraco.functools==4.1.0 +click==8.2.1 +wandb==0.19.11 +opendelta==0.3.2 +pycryptodome==3.23.0 +pathlib==1.0.1 +dill==0.3.8 +fsspec==2025.3.0 +delta-center-client==0.0.4 +cheroot==10.0.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.text==3.12.1 +packaging==24.2 +inflect==7.3.1 +jaraco.context==5.3.0 +wheel==0.45.1 +typeguard==4.3.0 +more-itertools==10.3.0 +tomli==2.0.1 +importlib_metadata==8.0.0 +backports.tarfile==1.2.0 +zipp==3.19.2 +jaraco.collections==5.1.0 +autocommand==2.2.2 +jaraco.functools==4.0.1 diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fc0e42748408f66f9c67fad4b64cefb8ac188261 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-metadata.json @@ -0,0 +1,104 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-04T07:46:08.916822Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage2_07041521", + "--num_query_token", + "8", + "--save_every_n_epochs", + "5", + "--max_epochs", + "15", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--mix_dataset", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": 
"stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2_07041521/", + "host": "dsw-265304-b8d7644bb-bs7r7", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1266618368" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-summary.json b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..cc9619494d4d6a3cd3d600128d6a794bcb40fb95 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-summary.json @@ -0,0 +1 @@ +{"dataset0/rouge_2":13.33877182006836,"epoch":14,"dataset0/bleu4":12.023162841796875,"dataset0/bleu2":14.52135181427002,"dataset0/acc":0,"dataset0/rouge_l":18.143781661987305,"dataloader2/val loss/dataloader_idx_2":2.210709571838379,"_step":518,"loss":0.12754811346530914,"dataset0/meteor_score":20.348670959472656,"_wandb":{"runtime":34542},"lr":1.0983357242366765e-05,"dataset0/rouge_1":20.244714736938477,"trainer/global_step":25229,"_timestamp":1.7516496969635456e+09,"dataloader0/val loss/dataloader_idx_0":0.3717030882835388,"_runtime":34528.047018257} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..aaa738ba8f64b3fc18e1d63d339e2d9a0cdc8fe9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log @@ -0,0 +1,95 @@ +{"time":"2025-07-04T15:46:08.966654664+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-core.log"} +{"time":"2025-07-04T15:46:10.62729617+08:00","level":"INFO","msg":"created new stream","id":"ds7lrt6r"} +{"time":"2025-07-04T15:46:10.627339189+08:00","level":"INFO","msg":"stream: started","id":"ds7lrt6r"} +{"time":"2025-07-04T15:46:10.627374947+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ds7lrt6r"} +{"time":"2025-07-04T15:46:10.627404904+08:00","level":"INFO","msg":"handler: 
started","stream_id":"ds7lrt6r"} +{"time":"2025-07-04T15:46:10.627398441+08:00","level":"INFO","msg":"sender: started","stream_id":"ds7lrt6r"} +{"time":"2025-07-04T15:46:12.482815718+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-07-04T21:58:22.739226816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:36512->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T22:01:34.73927758+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56674->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T22:01:43.328655381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:02:15.686026868+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:02:50.214808897+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:03:28.950188225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:04:14.982810813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:05:22.734102314+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:06:52.750047504+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:08:22.775414283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:08:43.307492899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-04T22:09:52.781192276+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:11:13.328660156+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000386663,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"2nedcn0bl5yp\" connection_id:\"127.0.0.1:57318\")"} +{"time":"2025-07-04T22:11:22.78819562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:12:52.789417986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:13:05.234783312+08:00","level":"WARN","msg":"runwork: taking a long 
time","seconds":600.000850585,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"127.0.0.1:57318\")"} +{"time":"2025-07-04T22:13:12.48514539+08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000329899,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2025-07-04T22:13:12.516285255+08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.00042574,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2025-07-04T22:14:22.815202117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:15:52.82570124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-04T22:17:22.890129793+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T22:18:29.532924447+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1036.204664967,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"2nedcn0bl5yp\" connection_id:\"127.0.0.1:57318\")"} +{"time":"2025-07-04T22:18:29.532969506+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":924.299032896,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"127.0.0.1:57318\")"} +{"time":"2025-07-04T22:18:29.532991216+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":917.048223115,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2025-07-04T22:18:29.532997449+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":917.017162756,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"} +{"time":"2025-07-04T22:22:02.242147225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:53384->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T22:26:53.14669432+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-04T22:27:48.780316277+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-04T22:31:07.795211328+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:59586->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-04T22:34:24.403211244+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:36792->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T22:42:22.611231819+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:52098->172.67.193.61:443: read: connection timed out"} 
+{"time":"2025-07-04T22:43:30.389313147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:48310->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T22:44:01.015059936+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-04T22:46:14.846692259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-04T22:48:31.349968366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39256->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T22:49:38.015743829+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38292->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-04T22:50:09.683679573+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": http2: client conn is closed"} +{"time":"2025-07-04T22:52:08.558045187+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:59988->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T22:54:22.824301514+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38894->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-04T22:59:26.307679579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:51684->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T23:02:32.979198883+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58210->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:10:21.459215862+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:35172->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:11:15.167490198+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:54942->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T23:16:10.863292487+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:43428->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-04T23:20:22.264503678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting 
headers)"} +{"time":"2025-07-04T23:23:20.723212059+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38048->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-04T23:25:26.885991394+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:42070->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-04T23:26:43.632088162+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T23:29:40.115213912+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39740->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-04T23:30:43.633929839+08:00","level":"ERROR","msg":"sender: sendStopStatus: failed to get run stopped status: net/http: request canceled (Client.Timeout or context cancellation while reading body)"} +{"time":"2025-07-04T23:31:28.634946467+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-04T23:32:54.67521322+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:51016->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:39:42.739210995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58242->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-04T23:45:14.003206141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:41730->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:48:41.876197491+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39056->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:52:10.25916891+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56968->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:55:53.491192209+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:52384->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-04T23:58:33.235181122+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:41192->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-05T00:01:39.09118175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:54870->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-05T00:03:30.660362141+08:00","level":"INFO","msg":"api: retrying error","error":"Post 
\"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-05T00:06:22.739197171+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56300->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:07:01.140306954+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:33830->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-05T00:10:11.603177492+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38990->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:17:09.907203145+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58654->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:20:10.131225125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39570->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-05T00:23:22.643197817+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:57940->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-05T00:26:13.651201419+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:37792->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-05T00:28:56.467221564+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38640->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:32:27.924195852+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:33266->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:33:00.356828932+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56794->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-05T00:36:52.115188168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:44060->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:40:28.83076072+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-05T00:41:16.304855216+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-05T00:44:59.539204741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 
10.1.8.17:54050->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T00:46:47.38055068+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:47426->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-05T00:48:09.874069624+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:60378->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-05T00:50:24.818784704+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"} +{"time":"2025-07-05T01:02:13.78019116+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:44352->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T01:10:57.377024443+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": context deadline exceeded"} +{"time":"2025-07-05T01:20:40.211190451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:50414->104.21.20.172:443: read: connection timed out"} +{"time":"2025-07-05T01:21:51.097875981+08:00","level":"INFO","msg":"stream: closing","id":"ds7lrt6r"} +{"time":"2025-07-05T01:21:51.097937445+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-07-05T01:21:51.099273597+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-07-05T01:21:58.380081154+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-07-05T01:22:06.296069471+08:00","level":"INFO","msg":"handler: closed","stream_id":"ds7lrt6r"} +{"time":"2025-07-05T01:22:06.296102451+08:00","level":"INFO","msg":"sender: closed","stream_id":"ds7lrt6r"} +{"time":"2025-07-05T01:22:06.296100202+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ds7lrt6r"} +{"time":"2025-07-05T01:22:06.302257653+08:00","level":"INFO","msg":"stream: closed","id":"ds7lrt6r"} diff --git a/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e0a5da7326151ac446a54dfb5f8bb681a2ab0fa8 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log @@ -0,0 +1,24 @@ +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Configure stats pid to 56865 +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:setup_run_log_directory():724] Logging user logs to 
./all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():852] calling init triggers +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():893] starting backend +2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():897] sending inform_init request +2025-07-04 15:46:08,914 INFO MainThread:56865 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-04 15:46:08,916 INFO MainThread:56865 [wandb_init.py:init():907] backend started and connected +2025-07-04 15:46:08,917 INFO MainThread:56865 [wandb_init.py:init():1005] updated telemetry +2025-07-04 15:46:08,922 INFO MainThread:56865 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-04 15:46:12,402 INFO MainThread:56865 [wandb_init.py:init():1104] starting run threads in backend +2025-07-04 15:46:12,679 INFO MainThread:56865 [wandb_run.py:_console_start():2573] atexit reg +2025-07-04 15:46:12,680 INFO MainThread:56865 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-04 15:46:12,685 INFO MainThread:56865 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-04 15:46:12,686 INFO MainThread:56865 [wandb_run.py:_redirect():2513] Redirects installed. +2025-07-04 15:46:12,697 INFO MainThread:56865 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-04 15:46:21,744 INFO MainThread:56865 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07041521', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 15, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 128, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-05 01:21:51,095 INFO MsgRouterThr:56865 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug-internal.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1efc19f987e5b1c198147f823733a6ec3c28098e --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug-internal.log @@ -0,0 +1,37 @@ +{"time":"2025-07-07T04:12:31.576270753+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-core.log"} +{"time":"2025-07-07T04:12:32.610923664+08:00","level":"INFO","msg":"created new stream","id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.61096355+08:00","level":"INFO","msg":"stream: started","id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.610992679+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.611029092+08:00","level":"INFO","msg":"sender: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.611068278+08:00","level":"INFO","msg":"handler: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:33.863206149+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-07-07T12:23:49.503077697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:48684->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-07T15:56:55.312158345+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:32778->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-07T16:16:19.961421973+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"Cloudflare encountered an error processing this request: Bad Gateway"} +{"time":"2025-07-07T22:42:18.689446152+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": unexpected EOF"} +{"time":"2025-07-08T00:47:05.732182871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": unexpected EOF"} +{"time":"2025-07-08T00:55:03.85032934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:35640->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T01:02:05.32396859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:40958->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T05:26:50.313027055+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:33866->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T07:35:50.188790186+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-08T21:17:21.227560805+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:52938->104.21.20.172:443: 
read: connection reset by peer"} +{"time":"2025-07-09T00:03:21.269215674+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:60070->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-09T00:04:06.207861293+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:48626->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T00:47:00.752106793+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:60404->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-09T01:31:51.271112465+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:54950->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-09T02:00:06.381543562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:52386->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T03:47:21.233980657+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream","body":"error code: 504"} +{"time":"2025-07-09T06:32:51.954073668+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"} +{"time":"2025-07-09T06:32:54.402822916+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"\r\n500 Internal Server Error\r\n\r\n500 Internal Server Error\r\nopenresty\r\n\r\n\r\n"} +{"time":"2025-07-09T06:32:59.256841538+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"\r\n500 Internal Server Error\r\n\r\n500 Internal Server Error\r\nopenresty
\r\n\r\n\r\n"} +{"time":"2025-07-09T08:06:06.165959794+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-09T09:53:21.494777402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:43336->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T18:32:06.535906689+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-09T18:36:24.226402129+08:00","level":"INFO","msg":"stream: closing","id":"gtrtcbb9"} +{"time":"2025-07-09T18:36:24.226481822+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-07-09T18:36:24.228410556+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-07-09T18:38:36.14051944+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-07-09T18:38:39.275501505+08:00","level":"INFO","msg":"handler: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.275541064+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.275567447+08:00","level":"INFO","msg":"sender: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.28791943+08:00","level":"INFO","msg":"stream: closed","id":"gtrtcbb9"} diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f6e91291cf56ab96aea6a51fd2d870ba349039dd --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/debug.log @@ -0,0 +1,24 @@ +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Configure stats pid to 15116 +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug.log +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-internal.log +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():852] calling init triggers +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():893] starting backend +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():897] sending inform_init request +2025-07-07 04:12:31,566 INFO MainThread:15116 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn 
+2025-07-07 04:12:31,569 INFO MainThread:15116 [wandb_init.py:init():907] backend started and connected +2025-07-07 04:12:31,570 INFO MainThread:15116 [wandb_init.py:init():1005] updated telemetry +2025-07-07 04:12:31,573 INFO MainThread:15116 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 04:12:33,813 INFO MainThread:15116 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 04:12:34,000 INFO MainThread:15116 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 04:12:34,000 INFO MainThread:15116 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 04:12:34,005 INFO MainThread:15116 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-07 04:12:34,005 INFO MainThread:15116 [wandb_run.py:_redirect():2513] Redirects installed. +2025-07-07 04:12:34,006 INFO MainThread:15116 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-07 04:12:42,512 INFO MainThread:15116 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070337_2datasets_noconstruct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 8, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-09 18:36:24,222 INFO MsgRouterThr:15116 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/config.yaml b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d563a957924bb5081ea913a84a72d021c9b9102 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/config.yaml @@ -0,0 +1,150 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +a_max_len: + value: 36 +accelerator: + value: gpu +accumulate_grad_batches: + value: 1 +batch_size: + value: 32 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +caption_eval_epoch: + value: 10 +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +do_sample: + value: false +enable_flash: + value: false +enbale_gradient_checkpointing: + value: false +filename: + value: stage2_07070337_2datasets_noconstruct +filter_side_qa: + value: false +inference_batch_size: + value: 4 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +llm_name: + value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300 +llm_tune: + value: mid_lora +lora_alpha: + value: 16 +lora_dropout: + value: 0.1 +lora_r: + value: 8 +lr_decay_rate: + value: 0.9 +max_epochs: + value: 10 +max_inference_len: + value: 128 +min_inference_len: + value: 1 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_beams: + value: 5 +num_query_token: + value: 8 +num_workers: + value: 8 +peft_config: + value: "" +peft_dir: + value: "" +plm_model: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +precision: + value: bf16-mixed +prompt: + value: 'The protein has the following properties: ' +prot_max_len: + value: 1024 +q_max_len: + value: 29 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +stage1_path: + value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt +stage2_path: + value: "" +strategy: + value: deepspeed +text_max_len: + value: 1024 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/output.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c92d0a7b42e50c616ec77e868acaa9c8e328332e --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/output.log @@ -0,0 +1,235 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct exists and is not empty. +Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------- +0 | blip2 | Blip2OPT | 7.9 B | train +------------------------------------------- +104 M Trainable params +7.8 B Non-trainable params +7.9 B Total params +31,459.025Total estimated model params size (MB) +174 Modules in train mode +1203 Modules in eval mode +Epoch 0: 0%| | 0/3331 [00:00<?, ?it/s]Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module> + main(get_args()) + File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main + trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage + self.fit_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run + self.advance() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance + self.epoch_loop.run(self._data_fetcher) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run + self.advance(data_fetcher) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance + batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run + self._optimizer_step(batch_idx, closure) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step + call._call_lightning_module_hook( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step + optimizer.step(closure=optimizer_closure) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step + step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step + optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) + File
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step + return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step + closure_result = closure() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__ + self._result = self.closure(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure + step_output = self._step_fn() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step + training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step + return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 283, in training_step + loss = self.blip2(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_opt.py", line 212, in forward + 
outputs = self.llm_model( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper + output = func(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 703, in forward + outputs: BaseModelOutputWithPast = self.model( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper + output = func(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 436, in forward + layer_outputs = decoder_layer( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/modeling_layers.py", line 48, in __call__ + return super().__call__(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 272, in forward + hidden_states = self.post_attention_layernorm(hidden_states) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 221, in forward + return self.weight * hidden_states.to(input_dtype) +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 228.00 MiB. GPU 0 has a total capacity of 79.35 GiB of which 108.19 MiB is free. Process 402605 has 79.24 GiB memory in use. Of the allocated memory 76.81 GiB is allocated by PyTorch, and 819.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module> +[rank0]: main(get_args()) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main +[rank0]: trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage +[rank0]: self.fit_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run +[rank0]: self.advance() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance +[rank0]: self.epoch_loop.run(self._data_fetcher) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run +[rank0]: self.advance(data_fetcher) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance +[rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run +[rank0]: self._optimizer_step(batch_idx, closure) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step +[rank0]: call._call_lightning_module_hook( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step +[rank0]: optimizer.step(closure=optimizer_closure) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step +[rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step +[rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) +[rank0]: File
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step +[rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step +[rank0]: closure_result = closure() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__ +[rank0]: self._result = self.closure(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure +[rank0]: step_output = self._step_fn() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step +[rank0]: training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 283, in training_step +[rank0]: loss = self.blip2(batch) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl 
+[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_opt.py", line 212, in forward +[rank0]: outputs = self.llm_model( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper +[rank0]: output = func(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 703, in forward +[rank0]: outputs: BaseModelOutputWithPast = self.model( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper +[rank0]: output = func(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 436, in forward +[rank0]: layer_outputs = decoder_layer( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/modeling_layers.py", line 48, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 272, in forward +[rank0]: hidden_states = self.post_attention_layernorm(hidden_states) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 221, in forward +[rank0]: return self.weight * hidden_states.to(input_dtype) +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 228.00 MiB. GPU 0 has a total capacity of 79.35 GiB of which 108.19 MiB is free. Process 402605 has 79.24 GiB memory in use. Of the allocated memory 76.81 GiB is allocated by PyTorch, and 819.32 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/requirements.txt b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7acac5f5fa52febebc01067eba0bccb92b3c4aea --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/requirements.txt @@ -0,0 +1,225 @@ +gitdb==4.0.12 +virtualenv==20.31.2 +wasabi==1.1.3 +spacy-loggers==1.0.5 +srsly==2.5.1 +urllib3==2.4.0 +annotated-types==0.7.0 +watchdog==6.0.0 +smart-open==7.1.0 +blis==1.3.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +executing==2.2.0 +altair==5.5.0 +torchmetrics==1.7.1 +mdurl==0.1.2 +protobuf==6.31.0 +psutil==7.0.0 +nvidia-cusparselt-cu12==0.6.2 +toml==0.10.2 +tzdata==2025.2 +distlib==0.3.9 +hf-xet==1.1.2 +idna==3.10 +setuptools==78.1.1 +ftfy==6.3.1 +pydeck==0.9.1 +wcwidth==0.2.13 +pydantic==2.11.5 +fairscale==0.4.4 +nvidia-nvtx-cu12==12.4.127 +plotly==6.1.1 +pexpect==4.9.0 +pyparsing==3.2.3 +torch==2.6.0 +catalogue==2.0.10 +pyarrow==20.0.0 +traitlets==5.14.3 +parso==0.8.4 +multidict==6.4.4 +pandas==2.2.3 +charset-normalizer==3.4.2 +braceexpand==0.1.7 +decorator==5.2.1 +webencodings==0.5.1 +nltk==3.9.1 +propcache==0.3.1 +pydantic_core==2.33.2 +platformdirs==4.3.8 +pycocoevalcap==1.2 +typing-inspection==0.4.1 +pycocotools==2.0.8 +cfgv==3.4.0 +contexttimer==0.3.3 +nvidia-cuda-runtime-cu12==12.4.127 +python-magic==0.4.27 +yarl==1.20.0 +referencing==0.36.2 +nvidia-curand-cu12==10.3.5.147 +scipy==1.15.3 +scikit-image==0.25.2 +aiosignal==1.3.2 +stack-data==0.6.3 +weasel==0.4.1 +nvidia-cuda-cupti-cu12==12.4.127 +smmap==5.0.2 +PyYAML==6.0.2 +nvidia-nccl-cu12==2.21.5 +thinc==8.3.6 +certifi==2025.4.26 +salesforce-lavis==1.0.2 +nvidia-cusparse-cu12==12.3.1.170 +decord==0.6.0 +prompt_toolkit==3.0.51 +nvidia-cufft-cu12==11.2.1.3 +ninja==1.11.1.4 +deepspeed==0.16.10+b666844f +MarkupSafe==3.0.2 +rpds-py==0.25.1 +filelock==3.18.0 +torchvision==0.21.0 +ipython==8.36.0 +contourpy==1.3.2 +tifffile==2025.5.10 +cachetools==5.5.2 +py-cpuinfo==9.0.0 +opencv-python-headless==4.5.5.64 +tqdm==4.67.1 +opendatasets==0.1.22 +pytz==2025.2 +bleach==6.2.0 +sentencepiece==0.2.0 +marisa-trie==1.2.1 +spacy-legacy==3.0.12 +timm==0.4.12 +tornado==6.5.1 +shellingham==1.5.4 +rouge_score==0.1.2 +langcodes==3.5.0 +GitPython==3.1.44 +nvidia-cufile-cu12==1.11.1.6 +portalocker==3.1.1 +iopath==0.1.10 +spacy==3.8.7 +wrapt==1.17.2 +ptyprocess==0.7.0 +jedi==0.19.2 +async-timeout==5.0.1 +wheel==0.45.1 +cloudpathlib==0.21.1 +hjson==3.1.0 +attrs==25.3.0 +pytorch-lightning==2.5.1.post0 +text-unidecode==1.3 +absl-py==2.2.2 +lightning-utilities==0.14.3 +lazy_loader==0.4 +triton==3.2.0 +Pygments==2.19.1 +pillow==11.2.1 +mpmath==1.3.0 +regex==2024.11.6 +matplotlib-inline==0.1.7 +nvidia-cusolver-cu12==11.6.1.9 +matplotlib==3.10.3 +einops==0.8.1 +nvidia-nvjitlink-cu12==12.4.127 +frozenlist==1.6.0 +webdataset==0.2.111 +exceptiongroup==1.3.0 +numpy==2.2.6 +networkx==3.4.2 +streamlit==1.45.1 +nvidia-ml-py==12.575.51 +msgpack==1.1.0 +language_data==1.3.0 +python-dateutil==2.9.0.post0 +pure_eval==0.2.3 +rich==14.0.0 +packaging==24.2 +asttokens==3.0.0 +blinker==1.9.0 +nvidia-cublas-cu12==12.4.5.8 
+pre_commit==4.2.0 +nodeenv==1.9.1 +identify==2.6.12 +python-slugify==8.0.4 +kaggle==1.7.4.5 +cymem==2.0.11 +omegaconf==2.3.0 +kiwisolver==1.4.8 +joblib==1.5.1 +Jinja2==3.1.6 +murmurhash==1.0.13 +aiohappyeyeballs==2.6.1 +typing_extensions==4.13.2 +sympy==1.13.1 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +safetensors==0.5.3 +markdown-it-py==3.0.0 +confection==0.1.5 +tenacity==9.1.2 +imageio==2.37.0 +six==1.17.0 +nvidia-cudnn-cu12==9.1.0.70 +fonttools==4.58.0 +requests==2.32.3 +antlr4-python3-runtime==4.9.3 +delta-center-client==0.0.4 +typer==0.16.0 +flash-attn==2.7.1.post1 +aiohttp==3.12.2 +wandb==0.19.11 +setproctitle==1.3.6 +docker-pycreds==0.4.0 +cheroot==10.0.1 +jmespath==0.10.0 +xxhash==3.5.0 +scikit-learn==1.6.1 +opencv-python==4.11.0.86 +tokenizers==0.21.1 +multiprocess==0.70.16 +preshed==3.0.10 +huggingface-hub==0.32.1 +transformers==4.52.3 +jaraco.functools==4.1.0 +crcmod==1.7 +pycryptodome==3.23.0 +sentry-sdk==2.29.1 +fsspec==2025.3.0 +opendelta==0.3.2 +jsonschema==4.24.0 +oss2==2.15.0 +web.py==0.62 +threadpoolctl==3.6.0 +datasets==3.6.0 +dill==0.3.8 +more-itertools==10.7.0 +narwhals==1.41.0 +pip==25.1.1 +click==8.2.1 +pycparser==2.22 +cryptography==45.0.3 +aliyun-python-sdk-core==2.16.0 +yacs==0.1.8 +cffi==1.17.1 +bigmodelvis==0.0.1 +aliyun-python-sdk-kms==2.16.5 +pathlib==1.0.1 +zipp==3.19.2 +autocommand==2.2.2 +typeguard==4.3.0 +jaraco.context==5.3.0 +backports.tarfile==1.2.0 +typing_extensions==4.12.2 +wheel==0.45.1 +importlib_metadata==8.0.0 +jaraco.text==3.12.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +jaraco.functools==4.0.1 +platformdirs==4.2.2 +more-itertools==10.3.0 +tomli==2.0.1 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1963e36dc20980fdd93dee8e84488be4735c4d91 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-metadata.json @@ -0,0 +1,104 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-06T19:46:22.411659Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage2_07070337_2datasets_noconstruct", + "--num_query_token", + "8", + "--save_every_n_epochs", + "5", + "--max_epochs", + "10", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--mix_dataset", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": "stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2_07070337_2datasets_noconstruct/", + "host": "dsw-266702-55dc696568-n7mtt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { 
+ "/": { + "total": "1623302262784", + "used": "1260404736" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-summary.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ee24a1c4da4e48bd8079e52dd9559dac7dee4ef7 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":314}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug-internal.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..481a4ade84dc7ac40ff81003f05675a77098271e --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2025-07-07T03:46:22.41748301+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug-core.log"} +{"time":"2025-07-07T03:46:23.486697713+08:00","level":"INFO","msg":"created new stream","id":"ub8khdxy"} +{"time":"2025-07-07T03:46:23.486728668+08:00","level":"INFO","msg":"stream: started","id":"ub8khdxy"} +{"time":"2025-07-07T03:46:23.486766821+08:00","level":"INFO","msg":"sender: started","stream_id":"ub8khdxy"} +{"time":"2025-07-07T03:46:23.486754473+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ub8khdxy"} +{"time":"2025-07-07T03:46:23.486818475+08:00","level":"INFO","msg":"handler: started","stream_id":"ub8khdxy"} +{"time":"2025-07-07T03:46:24.669888563+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-07-07T03:51:37.065749405+08:00","level":"INFO","msg":"stream: closing","id":"ub8khdxy"} +{"time":"2025-07-07T03:51:37.06580996+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-07-07T03:51:37.066902914+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug.log 
b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6c369c567cb8812a86a87d6a6d8419dd0c20e295 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug.log @@ -0,0 +1,24 @@ +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_setup.py:_flush():70] Configure stats pid to 3011 +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug.log +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/logs/debug-internal.log +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:init():852] calling init triggers +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:init():893] starting backend +2025-07-07 03:46:22,405 INFO MainThread:3011 [wandb_init.py:init():897] sending inform_init request +2025-07-07 03:46:22,407 INFO MainThread:3011 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-07 03:46:22,409 INFO MainThread:3011 [wandb_init.py:init():907] backend started and connected +2025-07-07 03:46:22,412 INFO MainThread:3011 [wandb_init.py:init():1005] updated telemetry +2025-07-07 03:46:22,414 INFO MainThread:3011 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 03:46:24,656 INFO MainThread:3011 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 03:46:24,836 INFO MainThread:3011 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 03:46:24,837 INFO MainThread:3011 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 03:46:24,840 INFO MainThread:3011 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-07 03:46:24,840 INFO MainThread:3011 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-07-07 03:46:24,842 INFO MainThread:3011 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-07 03:51:15,095 INFO MainThread:3011 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070337_2datasets_noconstruct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-07 03:51:37,064 INFO MsgRouterThr:3011 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/run-ub8khdxy.wandb b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/run-ub8khdxy.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f4605c79305fac7620119045532bf7fa016226f0 Binary files /dev/null and b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_034622-ub8khdxy/run-ub8khdxy.wandb differ diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/config.yaml b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a45a7f33ea7d68e2369ed28e0db6fc4d4a333211 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/config.yaml @@ -0,0 +1,150 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +a_max_len: + value: 36 +accelerator: + value: gpu +accumulate_grad_batches: + value: 1 +batch_size: + value: 16 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +caption_eval_epoch: + value: 10 +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +do_sample: + value: false +enable_flash: + value: false +enbale_gradient_checkpointing: + value: false +filename: + value: stage2_07070337_2datasets_noconstruct +filter_side_qa: + value: false +inference_batch_size: + value: 4 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +llm_name: + value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300 +llm_tune: + value: mid_lora +lora_alpha: + value: 16 +lora_dropout: + value: 0.1 +lora_r: + value: 8 +lr_decay_rate: + value: 0.9 +max_epochs: + value: 10 +max_inference_len: + value: 128 +min_inference_len: + value: 1 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_beams: + value: 5 +num_query_token: + value: 8 +num_workers: + value: 8 +peft_config: + value: "" +peft_dir: + value: "" +plm_model: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +precision: + value: bf16-mixed +prompt: + value: 'The protein has the following properties: ' +prot_max_len: + value: 1024 +q_max_len: + value: 29 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +stage1_path: + value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt +stage2_path: + value: "" +strategy: + value: deepspeed +text_max_len: + value: 1024 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/output.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/output.log new file mode 100644 index 
0000000000000000000000000000000000000000..66058a96c7742b0fdd525680d4a5738abedeefe4 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/output.log @@ -0,0 +1,239 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------- +0 | blip2 | Blip2OPT | 7.9 B | train +------------------------------------------- +104 M Trainable params +7.8 B Non-trainable params +7.9 B Total params +31,459.025Total estimated model params size (MB) +174 Modules in train mode +1203 Modules in eval mode +Epoch 0: 0%| | 0/6663 [00:00<?, ?it/s]Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module> + main(get_args()) + File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main + trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage + self.fit_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run + self.advance() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance + self.epoch_loop.run(self._data_fetcher) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run + self.advance(data_fetcher) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance + batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run + self._optimizer_step(batch_idx, closure) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step + call._call_lightning_module_hook( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step +
optimizer.step(closure=optimizer_closure) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step + step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step + optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step + return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step + closure_result = closure() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__ + self._result = self.closure(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure + step_output = self._step_fn() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step + training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step + return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 
283, in training_step + loss = self.blip2(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_opt.py", line 212, in forward + outputs = self.llm_model( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper + output = func(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 703, in forward + outputs: BaseModelOutputWithPast = self.model( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper + output = func(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 436, in forward + layer_outputs = decoder_layer( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/modeling_layers.py", line 48, in __call__ + return super().__call__(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 257, in forward + hidden_states, self_attn_weights = self.self_attn( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 189, in forward + attn_output, attn_weights = attention_interface( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/integrations/sdpa_attention.py", line 54, in sdpa_attention_forward + attn_output = torch.nn.functional.scaled_dot_product_attention( +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 79.35 GiB of which 40.19 MiB is free. Process 494248 has 79.30 GiB memory in use. Of the allocated memory 77.23 GiB is allocated by PyTorch, and 456.28 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[rank0]: Traceback (most recent call last): +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module> +[rank0]: main(get_args()) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main +[rank0]: trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage +[rank0]: self.fit_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run +[rank0]: self.advance() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance +[rank0]: self.epoch_loop.run(self._data_fetcher) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run +[rank0]: self.advance(data_fetcher) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance +[rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run +[rank0]: self._optimizer_step(batch_idx, closure) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step +[rank0]: call._call_lightning_module_hook( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step +[rank0]: optimizer.step(closure=optimizer_closure) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step +[rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in 
optimizer_step +[rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step +[rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step +[rank0]: closure_result = closure() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__ +[rank0]: self._result = self.closure(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure +[rank0]: step_output = self._step_fn() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step +[rank0]: training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 283, in training_step +[rank0]: loss = self.blip2(batch) +[rank0]: File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_opt.py", line 212, in forward +[rank0]: outputs = self.llm_model( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper +[rank0]: output = func(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 703, in forward +[rank0]: outputs: BaseModelOutputWithPast = self.model( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/utils/generic.py", line 969, in wrapper +[rank0]: output = func(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 436, in forward +[rank0]: layer_outputs = decoder_layer( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/modeling_layers.py", line 48, in __call__ +[rank0]: return super().__call__(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 257, in forward +[rank0]: hidden_states, self_attn_weights = self.self_attn( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 189, in forward +[rank0]: attn_output, attn_weights = attention_interface( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/integrations/sdpa_attention.py", line 54, in sdpa_attention_forward +[rank0]: attn_output = torch.nn.functional.scaled_dot_product_attention( +[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 114.00 MiB. 
GPU 0 has a total capacity of 79.35 GiB of which 40.19 MiB is free. Process 494248 has 79.30 GiB memory in use. Of the allocated memory 77.23 GiB is allocated by PyTorch, and 456.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/requirements.txt b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7acac5f5fa52febebc01067eba0bccb92b3c4aea --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/requirements.txt @@ -0,0 +1,225 @@ +gitdb==4.0.12 +virtualenv==20.31.2 +wasabi==1.1.3 +spacy-loggers==1.0.5 +srsly==2.5.1 +urllib3==2.4.0 +annotated-types==0.7.0 +watchdog==6.0.0 +smart-open==7.1.0 +blis==1.3.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +executing==2.2.0 +altair==5.5.0 +torchmetrics==1.7.1 +mdurl==0.1.2 +protobuf==6.31.0 +psutil==7.0.0 +nvidia-cusparselt-cu12==0.6.2 +toml==0.10.2 +tzdata==2025.2 +distlib==0.3.9 +hf-xet==1.1.2 +idna==3.10 +setuptools==78.1.1 +ftfy==6.3.1 +pydeck==0.9.1 +wcwidth==0.2.13 +pydantic==2.11.5 +fairscale==0.4.4 +nvidia-nvtx-cu12==12.4.127 +plotly==6.1.1 +pexpect==4.9.0 +pyparsing==3.2.3 +torch==2.6.0 +catalogue==2.0.10 +pyarrow==20.0.0 +traitlets==5.14.3 +parso==0.8.4 +multidict==6.4.4 +pandas==2.2.3 +charset-normalizer==3.4.2 +braceexpand==0.1.7 +decorator==5.2.1 +webencodings==0.5.1 +nltk==3.9.1 +propcache==0.3.1 +pydantic_core==2.33.2 +platformdirs==4.3.8 +pycocoevalcap==1.2 +typing-inspection==0.4.1 +pycocotools==2.0.8 +cfgv==3.4.0 +contexttimer==0.3.3 +nvidia-cuda-runtime-cu12==12.4.127 +python-magic==0.4.27 +yarl==1.20.0 +referencing==0.36.2 +nvidia-curand-cu12==10.3.5.147 +scipy==1.15.3 +scikit-image==0.25.2 +aiosignal==1.3.2 +stack-data==0.6.3 +weasel==0.4.1 +nvidia-cuda-cupti-cu12==12.4.127 +smmap==5.0.2 +PyYAML==6.0.2 +nvidia-nccl-cu12==2.21.5 +thinc==8.3.6 +certifi==2025.4.26 +salesforce-lavis==1.0.2 +nvidia-cusparse-cu12==12.3.1.170 +decord==0.6.0 +prompt_toolkit==3.0.51 +nvidia-cufft-cu12==11.2.1.3 +ninja==1.11.1.4 +deepspeed==0.16.10+b666844f +MarkupSafe==3.0.2 +rpds-py==0.25.1 +filelock==3.18.0 +torchvision==0.21.0 +ipython==8.36.0 +contourpy==1.3.2 +tifffile==2025.5.10 +cachetools==5.5.2 +py-cpuinfo==9.0.0 +opencv-python-headless==4.5.5.64 +tqdm==4.67.1 +opendatasets==0.1.22 +pytz==2025.2 +bleach==6.2.0 +sentencepiece==0.2.0 +marisa-trie==1.2.1 +spacy-legacy==3.0.12 +timm==0.4.12 +tornado==6.5.1 +shellingham==1.5.4 +rouge_score==0.1.2 +langcodes==3.5.0 +GitPython==3.1.44 +nvidia-cufile-cu12==1.11.1.6 +portalocker==3.1.1 +iopath==0.1.10 +spacy==3.8.7 +wrapt==1.17.2 +ptyprocess==0.7.0 +jedi==0.19.2 +async-timeout==5.0.1 +wheel==0.45.1 +cloudpathlib==0.21.1 +hjson==3.1.0 +attrs==25.3.0 +pytorch-lightning==2.5.1.post0 +text-unidecode==1.3 +absl-py==2.2.2 +lightning-utilities==0.14.3 +lazy_loader==0.4 +triton==3.2.0 +Pygments==2.19.1 +pillow==11.2.1 +mpmath==1.3.0 +regex==2024.11.6 +matplotlib-inline==0.1.7 +nvidia-cusolver-cu12==11.6.1.9 +matplotlib==3.10.3 +einops==0.8.1 +nvidia-nvjitlink-cu12==12.4.127 +frozenlist==1.6.0 +webdataset==0.2.111 +exceptiongroup==1.3.0 +numpy==2.2.6 +networkx==3.4.2 
+streamlit==1.45.1 +nvidia-ml-py==12.575.51 +msgpack==1.1.0 +language_data==1.3.0 +python-dateutil==2.9.0.post0 +pure_eval==0.2.3 +rich==14.0.0 +packaging==24.2 +asttokens==3.0.0 +blinker==1.9.0 +nvidia-cublas-cu12==12.4.5.8 +pre_commit==4.2.0 +nodeenv==1.9.1 +identify==2.6.12 +python-slugify==8.0.4 +kaggle==1.7.4.5 +cymem==2.0.11 +omegaconf==2.3.0 +kiwisolver==1.4.8 +joblib==1.5.1 +Jinja2==3.1.6 +murmurhash==1.0.13 +aiohappyeyeballs==2.6.1 +typing_extensions==4.13.2 +sympy==1.13.1 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +safetensors==0.5.3 +markdown-it-py==3.0.0 +confection==0.1.5 +tenacity==9.1.2 +imageio==2.37.0 +six==1.17.0 +nvidia-cudnn-cu12==9.1.0.70 +fonttools==4.58.0 +requests==2.32.3 +antlr4-python3-runtime==4.9.3 +delta-center-client==0.0.4 +typer==0.16.0 +flash-attn==2.7.1.post1 +aiohttp==3.12.2 +wandb==0.19.11 +setproctitle==1.3.6 +docker-pycreds==0.4.0 +cheroot==10.0.1 +jmespath==0.10.0 +xxhash==3.5.0 +scikit-learn==1.6.1 +opencv-python==4.11.0.86 +tokenizers==0.21.1 +multiprocess==0.70.16 +preshed==3.0.10 +huggingface-hub==0.32.1 +transformers==4.52.3 +jaraco.functools==4.1.0 +crcmod==1.7 +pycryptodome==3.23.0 +sentry-sdk==2.29.1 +fsspec==2025.3.0 +opendelta==0.3.2 +jsonschema==4.24.0 +oss2==2.15.0 +web.py==0.62 +threadpoolctl==3.6.0 +datasets==3.6.0 +dill==0.3.8 +more-itertools==10.7.0 +narwhals==1.41.0 +pip==25.1.1 +click==8.2.1 +pycparser==2.22 +cryptography==45.0.3 +aliyun-python-sdk-core==2.16.0 +yacs==0.1.8 +cffi==1.17.1 +bigmodelvis==0.0.1 +aliyun-python-sdk-kms==2.16.5 +pathlib==1.0.1 +zipp==3.19.2 +autocommand==2.2.2 +typeguard==4.3.0 +jaraco.context==5.3.0 +backports.tarfile==1.2.0 +typing_extensions==4.12.2 +wheel==0.45.1 +importlib_metadata==8.0.0 +jaraco.text==3.12.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +jaraco.functools==4.0.1 +platformdirs==4.2.2 +more-itertools==10.3.0 +tomli==2.0.1 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8a858a784570a1b22814eb9a8ebda6d31a0c9277 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-metadata.json @@ -0,0 +1,104 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-06T20:01:33.725255Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage2_07070337_2datasets_noconstruct", + "--num_query_token", + "8", + "--save_every_n_epochs", + "5", + "--max_epochs", + "10", + "--batch_size", + "16", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--mix_dataset", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": "stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2_07070337_2datasets_noconstruct/", + "host": 
"dsw-266702-55dc696568-n7mtt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1260429312" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-summary.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c084a6c3839257c4ea88721bf18b86802d587473 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":27}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug-internal.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..7909d92c788a8d0f7e78b86cf772e74fac7a5153 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-07-07T04:01:33.741515119+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug-core.log"} +{"time":"2025-07-07T04:01:34.828488835+08:00","level":"INFO","msg":"created new stream","id":"qbd0oc6y"} +{"time":"2025-07-07T04:01:34.828543414+08:00","level":"INFO","msg":"stream: started","id":"qbd0oc6y"} +{"time":"2025-07-07T04:01:34.828586908+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:01:34.828594722+08:00","level":"INFO","msg":"sender: started","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:01:34.828668755+08:00","level":"INFO","msg":"handler: started","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:01:36.085560463+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-07-07T04:02:01.343746812+08:00","level":"INFO","msg":"stream: closing","id":"qbd0oc6y"} +{"time":"2025-07-07T04:02:01.343821818+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-07-07T04:02:01.344607167+08:00","level":"INFO","msg":"Stopped 
system monitor"} +{"time":"2025-07-07T04:02:03.200391124+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-07-07T04:02:04.585366525+08:00","level":"INFO","msg":"handler: closed","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:02:04.585411205+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:02:04.585438803+08:00","level":"INFO","msg":"sender: closed","stream_id":"qbd0oc6y"} +{"time":"2025-07-07T04:02:04.589737518+08:00","level":"INFO","msg":"stream: closed","id":"qbd0oc6y"} diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1dabdbd4cedc33fd5e4d17b7017fba02f5a296b9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug.log @@ -0,0 +1,24 @@ +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_setup.py:_flush():70] Configure stats pid to 9224 +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug.log +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/logs/debug-internal.log +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:init():852] calling init triggers +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:init():893] starting backend +2025-07-07 04:01:33,718 INFO MainThread:9224 [wandb_init.py:init():897] sending inform_init request +2025-07-07 04:01:33,720 INFO MainThread:9224 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-07 04:01:33,721 INFO MainThread:9224 [wandb_init.py:init():907] backend started and connected +2025-07-07 04:01:33,726 INFO MainThread:9224 [wandb_init.py:init():1005] updated telemetry +2025-07-07 04:01:33,726 INFO MainThread:9224 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 04:01:36,076 INFO MainThread:9224 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 04:01:36,221 INFO MainThread:9224 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 04:01:36,221 INFO MainThread:9224 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 04:01:36,225 INFO MainThread:9224 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-07 04:01:36,225 INFO MainThread:9224 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-07-07 04:01:36,226 INFO MainThread:9224 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-07 04:01:44,655 INFO MainThread:9224 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070337_2datasets_noconstruct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 16, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-07 04:02:01,342 INFO MsgRouterThr:9224 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
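The run above (qbd0oc6y) OOMs in scaled_dot_product_attention on the very first step with --batch_size 16; its wandb-summary.json records a runtime of only 27 seconds. The relaunched run below (gtrtcbb9) keeps every other argument identical and halves --batch_size to 8, after which training completes. The allocator hint printed with the OOM can also be tried first; a minimal sketch, assuming the setting is applied at the top of stage2.py before CUDA is initialized (this is not the repo's actual code):

# Sketch only: apply the allocator hint from the traceback above.
# PYTORCH_CUDA_ALLOC_CONF must be set before CUDA initializes, hence
# before torch is imported anywhere in the process.
import os

# expandable_segments mainly helps fragmentation-driven OOMs; in the failed
# run only ~456 MiB was "reserved by PyTorch but unallocated", so the
# decisive fix was the smaller batch size used by the relaunched run below.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # noqa: E402  -- imported only after the env var is in place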
diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/run-qbd0oc6y.wandb b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/run-qbd0oc6y.wandb new file mode 100644 index 0000000000000000000000000000000000000000..86abec47f9406c320d328e222c7fe151cc4ff093 Binary files /dev/null and b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_040133-qbd0oc6y/run-qbd0oc6y.wandb differ diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/config.yaml b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8889615071241a904202e68799d2dbefccf92ec0 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/config.yaml @@ -0,0 +1,222 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": dataset0/rouge_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": trainer/global_step + "6": + - 3 + "7": [] + - "1": loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": lr + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataloader2/val loss/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/rouge_l + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataloader0/val loss/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/meteor_score + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/acc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": dataset0/bleu4 + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +a_max_len: + value: 36 +accelerator: + value: gpu +accumulate_grad_batches: + value: 1 +batch_size: + value: 8 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +caption_eval_epoch: + value: 10 +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +do_sample: + value: false +enable_flash: + value: false +enbale_gradient_checkpointing: + value: false +filename: + value: stage2_07070337_2datasets_noconstruct +filter_side_qa: + value: false +inference_batch_size: + value: 4 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +llm_name: + value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300 +llm_tune: + value: mid_lora +lora_alpha: + value: 16 +lora_dropout: + value: 0.1 +lora_r: + value: 8 +lr_decay_rate: + value: 0.9 +max_epochs: + value: 10 +max_inference_len: + value: 128 +min_inference_len: + value: 1 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_beams: + value: 5 +num_query_token: + value: 8 +num_workers: + value: 8 +peft_config: + value: "" +peft_dir: + value: "" +plm_model: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +precision: + value: bf16-mixed +prompt: + value: 'The protein has the following properties: ' 
+prot_max_len: + value: 1024 +q_max_len: + value: 29 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +stage1_path: + value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt +stage2_path: + value: "" +strategy: + value: deepspeed +text_max_len: + value: 1024 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/output.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fc3ee51feb0a2ad2be1274a7bce2666374e52a14 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/output.log @@ -0,0 +1,35 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------- +0 | blip2 | Blip2OPT | 7.9 B | train +------------------------------------------- +104 M Trainable params +7.8 B Non-trainable params +7.9 B Total params +31,459.025Total estimated model params size (MB) +174 Modules in train mode +1203 Modules in eval mode +Epoch 9: 100%|█████████████████████████████████████████████████████████████| 13326/13326 [6:05:11<00:00, 0.61it/s, v_num=cbb9]BLEU-2 score: 25.61839477406372 +BLEU-4 score: 19.679000379664167█████████████████████████████████████████████████████████████| 313/313 [23:27<00:00, 0.22it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + sd = self.module.state_dict(destination, prefix, keep_vars) +20000it [01:34, 211.46it/s] +20000it [00:35, 559.31it/s] +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +Average Meteor score: 26.87041636768526 +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +ROUGE score: +rouge1: 31.383168994263787 +rouge2: 19.331697430302135 +rougeL: 26.314038584048582 +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu4', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. 
+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_1', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_l', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/meteor_score', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices. +Epoch 9: 100%|█████████████████████████████████████████████████████████████| 13326/13326 [6:59:02<00:00, 0.53it/s, v_num=cbb9] + +`Trainer.fit` stopped: `max_epochs=10` reached. diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/requirements.txt b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7acac5f5fa52febebc01067eba0bccb92b3c4aea --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/requirements.txt @@ -0,0 +1,225 @@ +gitdb==4.0.12 +virtualenv==20.31.2 +wasabi==1.1.3 +spacy-loggers==1.0.5 +srsly==2.5.1 +urllib3==2.4.0 +annotated-types==0.7.0 +watchdog==6.0.0 +smart-open==7.1.0 +blis==1.3.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +executing==2.2.0 +altair==5.5.0 +torchmetrics==1.7.1 +mdurl==0.1.2 +protobuf==6.31.0 +psutil==7.0.0 +nvidia-cusparselt-cu12==0.6.2 +toml==0.10.2 +tzdata==2025.2 +distlib==0.3.9 +hf-xet==1.1.2 +idna==3.10 +setuptools==78.1.1 +ftfy==6.3.1 +pydeck==0.9.1 +wcwidth==0.2.13 +pydantic==2.11.5 +fairscale==0.4.4 +nvidia-nvtx-cu12==12.4.127 +plotly==6.1.1 +pexpect==4.9.0 +pyparsing==3.2.3 +torch==2.6.0 +catalogue==2.0.10 +pyarrow==20.0.0 +traitlets==5.14.3 +parso==0.8.4 +multidict==6.4.4 +pandas==2.2.3 +charset-normalizer==3.4.2 +braceexpand==0.1.7 +decorator==5.2.1 +webencodings==0.5.1 +nltk==3.9.1 +propcache==0.3.1 +pydantic_core==2.33.2 +platformdirs==4.3.8 +pycocoevalcap==1.2 +typing-inspection==0.4.1 +pycocotools==2.0.8 +cfgv==3.4.0 +contexttimer==0.3.3 +nvidia-cuda-runtime-cu12==12.4.127 +python-magic==0.4.27 +yarl==1.20.0 +referencing==0.36.2 +nvidia-curand-cu12==10.3.5.147 +scipy==1.15.3 +scikit-image==0.25.2 +aiosignal==1.3.2 +stack-data==0.6.3 +weasel==0.4.1 +nvidia-cuda-cupti-cu12==12.4.127 +smmap==5.0.2 +PyYAML==6.0.2 +nvidia-nccl-cu12==2.21.5 +thinc==8.3.6 +certifi==2025.4.26 +salesforce-lavis==1.0.2 +nvidia-cusparse-cu12==12.3.1.170 +decord==0.6.0 +prompt_toolkit==3.0.51 +nvidia-cufft-cu12==11.2.1.3 +ninja==1.11.1.4 +deepspeed==0.16.10+b666844f +MarkupSafe==3.0.2 +rpds-py==0.25.1 +filelock==3.18.0 +torchvision==0.21.0 +ipython==8.36.0 +contourpy==1.3.2 +tifffile==2025.5.10 +cachetools==5.5.2 +py-cpuinfo==9.0.0 
+opencv-python-headless==4.5.5.64 +tqdm==4.67.1 +opendatasets==0.1.22 +pytz==2025.2 +bleach==6.2.0 +sentencepiece==0.2.0 +marisa-trie==1.2.1 +spacy-legacy==3.0.12 +timm==0.4.12 +tornado==6.5.1 +shellingham==1.5.4 +rouge_score==0.1.2 +langcodes==3.5.0 +GitPython==3.1.44 +nvidia-cufile-cu12==1.11.1.6 +portalocker==3.1.1 +iopath==0.1.10 +spacy==3.8.7 +wrapt==1.17.2 +ptyprocess==0.7.0 +jedi==0.19.2 +async-timeout==5.0.1 +wheel==0.45.1 +cloudpathlib==0.21.1 +hjson==3.1.0 +attrs==25.3.0 +pytorch-lightning==2.5.1.post0 +text-unidecode==1.3 +absl-py==2.2.2 +lightning-utilities==0.14.3 +lazy_loader==0.4 +triton==3.2.0 +Pygments==2.19.1 +pillow==11.2.1 +mpmath==1.3.0 +regex==2024.11.6 +matplotlib-inline==0.1.7 +nvidia-cusolver-cu12==11.6.1.9 +matplotlib==3.10.3 +einops==0.8.1 +nvidia-nvjitlink-cu12==12.4.127 +frozenlist==1.6.0 +webdataset==0.2.111 +exceptiongroup==1.3.0 +numpy==2.2.6 +networkx==3.4.2 +streamlit==1.45.1 +nvidia-ml-py==12.575.51 +msgpack==1.1.0 +language_data==1.3.0 +python-dateutil==2.9.0.post0 +pure_eval==0.2.3 +rich==14.0.0 +packaging==24.2 +asttokens==3.0.0 +blinker==1.9.0 +nvidia-cublas-cu12==12.4.5.8 +pre_commit==4.2.0 +nodeenv==1.9.1 +identify==2.6.12 +python-slugify==8.0.4 +kaggle==1.7.4.5 +cymem==2.0.11 +omegaconf==2.3.0 +kiwisolver==1.4.8 +joblib==1.5.1 +Jinja2==3.1.6 +murmurhash==1.0.13 +aiohappyeyeballs==2.6.1 +typing_extensions==4.13.2 +sympy==1.13.1 +jsonschema-specifications==2025.4.1 +cycler==0.12.1 +safetensors==0.5.3 +markdown-it-py==3.0.0 +confection==0.1.5 +tenacity==9.1.2 +imageio==2.37.0 +six==1.17.0 +nvidia-cudnn-cu12==9.1.0.70 +fonttools==4.58.0 +requests==2.32.3 +antlr4-python3-runtime==4.9.3 +delta-center-client==0.0.4 +typer==0.16.0 +flash-attn==2.7.1.post1 +aiohttp==3.12.2 +wandb==0.19.11 +setproctitle==1.3.6 +docker-pycreds==0.4.0 +cheroot==10.0.1 +jmespath==0.10.0 +xxhash==3.5.0 +scikit-learn==1.6.1 +opencv-python==4.11.0.86 +tokenizers==0.21.1 +multiprocess==0.70.16 +preshed==3.0.10 +huggingface-hub==0.32.1 +transformers==4.52.3 +jaraco.functools==4.1.0 +crcmod==1.7 +pycryptodome==3.23.0 +sentry-sdk==2.29.1 +fsspec==2025.3.0 +opendelta==0.3.2 +jsonschema==4.24.0 +oss2==2.15.0 +web.py==0.62 +threadpoolctl==3.6.0 +datasets==3.6.0 +dill==0.3.8 +more-itertools==10.7.0 +narwhals==1.41.0 +pip==25.1.1 +click==8.2.1 +pycparser==2.22 +cryptography==45.0.3 +aliyun-python-sdk-core==2.16.0 +yacs==0.1.8 +cffi==1.17.1 +bigmodelvis==0.0.1 +aliyun-python-sdk-kms==2.16.5 +pathlib==1.0.1 +zipp==3.19.2 +autocommand==2.2.2 +typeguard==4.3.0 +jaraco.context==5.3.0 +backports.tarfile==1.2.0 +typing_extensions==4.12.2 +wheel==0.45.1 +importlib_metadata==8.0.0 +jaraco.text==3.12.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +jaraco.functools==4.0.1 +platformdirs==4.2.2 +more-itertools==10.3.0 +tomli==2.0.1 diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3b39eb4035c385a7e0df6670e82d06c4e7dada4d --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-metadata.json @@ -0,0 +1,104 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-07-06T20:12:31.569419Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", 
+ "stage2_07070337_2datasets_noconstruct", + "--num_query_token", + "8", + "--save_every_n_epochs", + "5", + "--max_epochs", + "10", + "--batch_size", + "8", + "--precision", + "bf16-mixed", + "--num_workers", + "8", + "--plm_model", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--llm_name", + "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300", + "--llm_tune", + "mid_lora", + "--mix_dataset", + "--stage1_path", + "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", + "codePath": "stage2.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage2_07070337_2datasets_noconstruct/", + "host": "dsw-266702-55dc696568-n7mtt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage2.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1260433408" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-summary.json b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5e8b9d3b748fb284dc88a8c3452a45b9cf298db7 --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":224632},"dataset0/rouge_l":26.31403923034668,"epoch":9,"dataloader0/val loss/dataloader_idx_0":0.37918829917907715,"_step":2674,"_timestamp":1.7520573705882266e+09,"dataset0/acc":0,"_runtime":224619.019046321,"dataset0/bleu4":19.679000854492188,"dataset0/rouge_2":19.331697463989258,"dataset0/rouge_1":31.383169174194336,"dataloader2/val loss/dataloader_idx_2":0.21475322544574738,"dataset0/bleu2":25.61839485168457,"dataset0/meteor_score":26.87041664123535,"trainer/global_step":133259,"lr":1.2202456673549023e-05,"loss":0.10850226879119873} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-internal.log 
b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1efc19f987e5b1c198147f823733a6ec3c28098e --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-internal.log @@ -0,0 +1,37 @@ +{"time":"2025-07-07T04:12:31.576270753+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-core.log"} +{"time":"2025-07-07T04:12:32.610923664+08:00","level":"INFO","msg":"created new stream","id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.61096355+08:00","level":"INFO","msg":"stream: started","id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.610992679+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.611029092+08:00","level":"INFO","msg":"sender: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:32.611068278+08:00","level":"INFO","msg":"handler: started","stream_id":"gtrtcbb9"} +{"time":"2025-07-07T04:12:33.863206149+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-07-07T12:23:49.503077697+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:48684->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-07T15:56:55.312158345+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:32778->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-07T16:16:19.961421973+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"Cloudflare encountered an error processing this request: Bad Gateway"} +{"time":"2025-07-07T22:42:18.689446152+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": unexpected EOF"} +{"time":"2025-07-08T00:47:05.732182871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": unexpected EOF"} +{"time":"2025-07-08T00:55:03.85032934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:35640->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T01:02:05.32396859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:40958->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T05:26:50.313027055+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:33866->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-08T07:35:50.188790186+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-07-08T21:17:21.227560805+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:52938->104.21.20.172:443: read: connection reset by peer"} 
+{"time":"2025-07-09T00:03:21.269215674+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:60070->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-09T00:04:06.207861293+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:48626->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T00:47:00.752106793+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream\": read tcp 10.1.2.112:60404->172.67.193.61:443: read: connection timed out"} +{"time":"2025-07-09T01:31:51.271112465+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:54950->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-07-09T02:00:06.381543562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:52386->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T03:47:21.233980657+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070337_2datasets_noconstruct/gtrtcbb9/file_stream","body":"error code: 504"} +{"time":"2025-07-09T06:32:51.954073668+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"} +{"time":"2025-07-09T06:32:54.402822916+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"\r\n500 Internal Server Error\r\n\r\n

500 Internal Server Error

\r\n
openresty
\r\n\r\n\r\n"} +{"time":"2025-07-09T06:32:59.256841538+08:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.bandw.top/graphql","body":"\r\n500 Internal Server Error\r\n\r\n

500 Internal Server Error

\r\n
openresty
\r\n\r\n\r\n"} +{"time":"2025-07-09T08:06:06.165959794+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-09T09:53:21.494777402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.112:43336->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-07-09T18:32:06.535906689+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-07-09T18:36:24.226402129+08:00","level":"INFO","msg":"stream: closing","id":"gtrtcbb9"} +{"time":"2025-07-09T18:36:24.226481822+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-07-09T18:36:24.228410556+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-07-09T18:38:36.14051944+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-07-09T18:38:39.275501505+08:00","level":"INFO","msg":"handler: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.275541064+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.275567447+08:00","level":"INFO","msg":"sender: closed","stream_id":"gtrtcbb9"} +{"time":"2025-07-09T18:38:39.28791943+08:00","level":"INFO","msg":"stream: closed","id":"gtrtcbb9"} diff --git a/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug.log b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f6e91291cf56ab96aea6a51fd2d870ba349039dd --- /dev/null +++ b/ProtT3/all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug.log @@ -0,0 +1,24 @@ +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Configure stats pid to 15116 +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug.log +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/logs/debug-internal.log +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():852] calling init triggers +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():893] starting backend +2025-07-07 04:12:31,564 INFO MainThread:15116 [wandb_init.py:init():897] sending inform_init request +2025-07-07 04:12:31,566 INFO MainThread:15116 
[backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-07-07 04:12:31,569 INFO MainThread:15116 [wandb_init.py:init():907] backend started and connected +2025-07-07 04:12:31,570 INFO MainThread:15116 [wandb_init.py:init():1005] updated telemetry +2025-07-07 04:12:31,573 INFO MainThread:15116 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-07-07 04:12:33,813 INFO MainThread:15116 [wandb_init.py:init():1104] starting run threads in backend +2025-07-07 04:12:34,000 INFO MainThread:15116 [wandb_run.py:_console_start():2573] atexit reg +2025-07-07 04:12:34,000 INFO MainThread:15116 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-07-07 04:12:34,005 INFO MainThread:15116 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-07-07 04:12:34,005 INFO MainThread:15116 [wandb_run.py:_redirect():2513] Redirects installed. +2025-07-07 04:12:34,006 INFO MainThread:15116 [wandb_init.py:init():1150] run started, returning control to user process +2025-07-07 04:12:42,512 INFO MainThread:15116 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070337_2datasets_noconstruct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 8, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False} +2025-07-09 18:36:24,222 INFO MsgRouterThr:15116 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
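The completed run's console log (output.log above) repeats Lightning's result.py:434 recommendation once per caption metric: values logged on epoch level in this 8-GPU DeepSpeed setup should pass sync_dist=True so the epoch aggregate is averaged across ranks rather than taken from whichever rank logs it. A minimal sketch of the suggested call, with a hypothetical module and helper standing in for the repo's Blip2Stage2 code:

# Sketch only; the class and compute_bleu2 helper are assumptions, not the
# repo's API. Only the self.log(..., sync_dist=True) call mirrors the warning.
import pytorch_lightning as pl


class CaptionEvalModule(pl.LightningModule):  # hypothetical stand-in
    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        score = self.compute_bleu2(batch)  # assumed per-rank metric helper
        # sync_dist=True reduces (means) the metric across all ranks at
        # epoch end, which is what the repeated warning above asks for.
        self.log("dataset0/bleu2", score, on_epoch=True, sync_dist=True)

The final aggregated values are in any case captured in the committed wandb-summary.json above (dataset0/bleu2 ≈ 25.62, bleu4 ≈ 19.68, rouge_l ≈ 26.31, meteor_score ≈ 26.87).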