yuccaaa committed on
Commit
bbcacd6
·
verified ·
1 Parent(s): 14ebfd8

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +3 -0
  3. =0.12.10 +33 -0
  4. README.md +105 -0
  5. all_checkpoints/.gitignore +2 -0
  6. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  7. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  8. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  9. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  10. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  12. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  13. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  14. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  15. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt +3 -0
  16. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/latest +3 -0
  17. all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/zero_to_fp32.py +3 -0
  18. all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log +42 -0
  19. all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log +24 -0
  20. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml +236 -0
  21. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log +21 -0
  22. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt +225 -0
  23. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json +97 -0
  24. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json +1 -0
  25. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log +42 -0
  26. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log +24 -0
  27. all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb +3 -0
  28. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  29. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  30. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  31. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  32. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  33. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  34. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  35. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  36. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  37. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/latest +3 -0
  38. all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/zero_to_fp32.py +3 -0
  39. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  40. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  41. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  42. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  43. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  44. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  45. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  46. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  47. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  48. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt +3 -0
  49. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/latest +3 -0
  50. all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/zero_to_fp32.py +3 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
37
+ all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
38
+ all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
=0.12.10 ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
2
+ Collecting wandb
3
+ Downloading https://mirrors.aliyun.com/pypi/packages/f9/31/eeb2878b26566c04c3e9b8b20b3ec3c54a2be50535088d36a37c008e07a3/wandb-0.19.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.4 MB)
4
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.4/21.4 MB 6.2 MB/s eta 0:00:00
5
+ Requirement already satisfied: click!=8.0.0,>=7.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (8.2.1)
6
+ Collecting docker-pycreds>=0.4.0 (from wandb)
7
+ Downloading https://mirrors.aliyun.com/pypi/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
8
+ Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (3.1.44)
9
+ Requirement already satisfied: platformdirs in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (4.3.8)
10
+ Requirement already satisfied: protobuf!=4.21.0,!=5.28.0,<7,>=3.19.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (6.31.0)
11
+ Requirement already satisfied: psutil>=5.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (7.0.0)
12
+ Requirement already satisfied: pydantic<3 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (2.11.5)
13
+ Requirement already satisfied: pyyaml in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (6.0.2)
14
+ Requirement already satisfied: requests<3,>=2.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (2.32.3)
15
+ Collecting sentry-sdk>=2.0.0 (from wandb)
16
+ Downloading https://mirrors.aliyun.com/pypi/packages/f0/e5/da07b0bd832cefd52d16f2b9bbbe31624d57552602c06631686b93ccb1bd/sentry_sdk-2.29.1-py2.py3-none-any.whl (341 kB)
17
+ Collecting setproctitle (from wandb)
18
+ Downloading https://mirrors.aliyun.com/pypi/packages/67/2b/c3cbd4a4462c1143465d8c151f1d51bbfb418e60a96a754329d28d416575/setproctitle-1.3.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
19
+ Requirement already satisfied: setuptools in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (78.1.1)
20
+ Requirement already satisfied: typing-extensions<5,>=4.4 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (4.13.2)
21
+ Requirement already satisfied: annotated-types>=0.6.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (0.7.0)
22
+ Requirement already satisfied: pydantic-core==2.33.2 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (2.33.2)
23
+ Requirement already satisfied: typing-inspection>=0.4.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (0.4.1)
24
+ Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.4.2)
25
+ Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.10)
26
+ Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2.4.0)
27
+ Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2025.4.26)
28
+ Requirement already satisfied: six>=1.4.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from docker-pycreds>=0.4.0->wandb) (1.17.0)
29
+ Requirement already satisfied: gitdb<5,>=4.0.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from gitpython!=3.1.29,>=1.0.0->wandb) (4.0.12)
30
+ Requirement already satisfied: smmap<6,>=3.0.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb) (5.0.2)
31
+ Installing collected packages: setproctitle, sentry-sdk, docker-pycreds, wandb
32
+
33
+ Successfully installed docker-pycreds-0.4.0 sentry-sdk-2.29.1 setproctitle-1.3.6 wandb-0.19.11
README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ProtT3: Protein-to-Text Generation for Text-based Protein Understanding
2
+
3
+ Code for our ACL 2024 paper.
4
+
5
+ Authors: Zhiyuan Liu, An Zhang, Hao Fei, Enzhi Zhang, Xiang Wang, Kenji Kawaguchi, Tat-Seng Chua
6
+
7
+
8
+ ## Dependencies
9
+
10
+ python==3.8
11
+
12
+ * Install PyTorch with cuda-11.7 using conda by following the instructions in [link](https://pytorch.org/get-started/locally/)
13
+ * Install flash-attention by running `pip install flash-attn --no-build-isolation`. You might need to install the following dependencies first, for building the flash-attention module:
14
+ * `pip install packaging ninja`
15
+ * `conda install -c "nvidia/label/cuda-11.7.1" cuda-nvcc`
16
+ * `conda install -c "nvidia/label/cuda-11.7.1" cuda-libraries-dev`
17
+ * Install the latest version of OpenDelta by running `pip install git+https://github.com/thunlp/OpenDelta.git`
18
+ * Install Lavis: `pip install rouge_score nltk salesforce-lavis`
19
+ * Install others: `pip install -U transformers pytorch-lightning`
20
+ * Install the latest version of DeepSpeed: `pip install git+https://github.com/microsoft/DeepSpeed.git`
21
+ * Download nltk corpus:
22
+ ```
23
+ import nltk
24
+ nltk.download('wordnet')
25
+ ```
26
+
27
+ ## Dataset
28
+
29
+ Download our pre-processed datasets from [link](https://osf.io/23azs/?view_only=185575515e714f4798499bf06513a730), and unzip the datasets under the `./data` directory
30
+
31
+ ## Reproduce results by training from scratch
32
+
33
+ * Reproduce results in stage 1:
34
+
35
+ ```sh
36
+ python stage1.py --devices '0,1,2,3' --mode train --filename stage1_ckpt --num_query_token 8 --plm_name "facebook/esm2_t30_150M_UR50D" --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8
37
+ ```
38
+
39
+ * Convert stage1's DeepSpeed checkpoint to PyTorch format by running
40
+
41
+ ```sh
42
+ python convert.py --input /path/to/stage1/ckpt/address --output /path/to/ckpt/saving/address
43
+ ```
44
+
45
+ * Reproduce results in stage 2:
46
+
47
+ * Protein Captioning:
48
+
49
+ ```sh
50
+ python stage2.py --devices '0,1,2,3' --mode train --filename protein_captioning_swiss_dataset --num_query_token 8 --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --llm_tune mid_lora --enable_flash --root './data/SwissProtV3' --stage1_path /path/to/ckpt/saving/address;
51
+ ```
52
+
53
+ * Protein Question-Answering:
54
+
55
+ ```sh
56
+ python stage2.py --devices '0,1,2,3' --mode train --filename prot_qa --num_query_token 8 --save_every_n_epochs 10 --num_workers 8 --batch_size 128 --accumulate_grad_batches 1 --precision 'bf16-mixed' --root "data/PDBDataset" --llm_tune mid_lora --prompt "Question: {} Answer:" --inference_batch 32 --max_inference_len 36 --stage1_path /path/to/ckpt/saving/address;
57
+ ```
58
+
59
+ * After running one of the two scripts above, the model's protein-to-text generation results will be saved at `./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt`. You can evaluate the results by running
60
+
61
+ ```sh
62
+ ## for question-answering evaluation
63
+ python read_results --path ./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt --qa_question
64
+
65
+ ## for protein captioning evaluation
66
+ python read_results --path ./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt
67
+ ```
68
+
69
+ ## Reproduce results by loading our checkpoints
70
+
71
+ Download our released checkpoints from [link](https://osf.io/23azs/?view_only=185575515e714f4798499bf06513a730)
72
+
73
+ * Reproduce results in stage 1:
74
+
75
+ ```sh
76
+ python stage1.py --devices '0,1,2,3' --mode eval --filename stage1_ckpt --num_query_token 8 --plm_name "facebook/esm2_t30_150M_UR50D" --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --init_checkpoint /path/to/stage1.ckpt;
77
+ ```
78
+
79
+ * Reproduce results in stage 2:
80
+
81
+ * Protein Captioning:
82
+
83
+ ```sh
84
+ python stage2.py --devices '0,1,2,3' --mode train --filename protein_captioning_swiss_dataset --num_query_token 8 --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --llm_tune mid_lora --enable_flash --root './data/SwissProtV3' --init_checkpoint /path/to/swiss_ft.ckpt;
85
+ ```
86
+
87
+ * Protein Question-Answering:
88
+
89
+ ```sh
90
+ python stage2.py --devices '0,1,2,3' --mode train --filename prot_qa --num_query_token 8 --save_every_n_epochs 10 --num_workers 8 --batch_size 128 --accumulate_grad_batches 1 --precision 'bf16-mixed' --root "data/PDBDataset" --llm_tune mid_lora --prompt "Question: {} Answer:" --inference_batch 32 --max_inference_len 36 --init_checkpoint /path/to/pdbqa_ft.ckpt;
91
+ ```
92
+
93
+
94
+ ## Citation
95
+
96
+ ```bib
97
+ @inproceedings{liu2024prott,
98
+ title={ProtT3: Protein-to-Text Generation for Text-based Protein Understanding},
99
+ author={Liu, Zhiyuan and Zhang, An and Fei, Hao and Zhang, Enzhi and Wang, Xiang and Kawaguchi, Kenji and Chua, Tat-Seng},
100
+ booktitle={{ACL}},
101
+ publisher = {Association for Computational Linguistics},
102
+ year={2024},
103
+ url={https://openreview.net/forum?id=ZmIjOPil2b}
104
+ }
105
+ ```
all_checkpoints/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *
2
+ !.gitignore
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d6798603170f2f7903bb9a0d21a5ed25e860528296107c8b31cd2914a86965
3
+ size 269461360
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ab0580d136844b18a0e388946d57dca52ab782052d7885d0c033a07708b3957
3
+ size 269466288
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e71ea3d389287f043d6acb2922ee075f8ceaf30ac73b145c5cb709ca3a4cfea1
3
+ size 269466352
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bd10e7db6ec37fcbdb2baa6f51691e815bde2e5a112b516675fb556c4674e87
3
+ size 269465648
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b5975d03715bc1cc0eb9de733fbb4475c0540e790c6f00d0aae625e8ba08230
3
+ size 269466096
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61d27b7a3de97f24ef54d58dc7e2a57d2568ed520177746f47540c3f4d26a847
3
+ size 269466352
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08bcb9b3d8e2b0e8e71b925b4c0ef0da6702e9609c252dc90607d9d4e3801b4c
3
+ size 269466096
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d426f595bbbde8b5a76ead76e06cee6febe35c7cf6edcf40aaf06560d47100e
3
+ size 269465904
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2691cb5de812f5198012ce9c319ee4faae20c95a97a5639b3796d13561d44b49
3
+ size 359448824
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aabff8ac8cffa262ba0de81d44b32342f1fce2f1b6efe66064bb9d99b3ef01bd
3
+ size 718651620
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/latest ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
3
+ size 10
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
3
+ size 33272
all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
3
+ {"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
4
+ {"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
5
+ {"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
6
+ {"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
7
+ {"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
8
+ {"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
10
+ {"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
11
+ {"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
14
+ {"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
16
+ {"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
17
+ {"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
18
+ {"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
20
+ {"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
22
+ {"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
24
+ {"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
27
+ {"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
28
+ {"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
29
+ {"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
30
+ {"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
33
+ {"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
34
+ {"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
35
+ {"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
36
+ {"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
37
+ {"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
38
+ {"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
39
+ {"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
40
+ {"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
41
+ {"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
42
+ {"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
3
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
7
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
8
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ - "1": train_loss
10
+ "5": 1
11
+ "6":
12
+ - 1
13
+ - 3
14
+ "7": []
15
+ - "1": loader1/val_loss/dataloader_idx_1
16
+ "5": 1
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": loader0/val_loss_lm/dataloader_idx_0
22
+ "5": 1
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
28
+ "5": 1
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": loader2/val_loss_lm/dataloader_idx_2
34
+ "5": 1
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
40
+ "5": 1
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": epoch
46
+ "5": 1
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": lr
52
+ "5": 1
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
58
+ "5": 1
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
64
+ "5": 1
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": train_loss_ptc
70
+ "5": 1
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": train_loss_ptm
76
+ "5": 1
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": train_loss_lm
82
+ "5": 1
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
88
+ "5": 1
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": loader0/val_loss/dataloader_idx_0
94
+ "5": 1
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": loader2/val_loss/dataloader_idx_2
100
+ "5": 1
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": loader1/val_loss_lm/dataloader_idx_1
106
+ "5": 1
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
112
+ "5": 1
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ python_version: 3.10.0
118
+ t:
119
+ "1":
120
+ - 1
121
+ - 5
122
+ - 9
123
+ - 11
124
+ - 33
125
+ - 41
126
+ - 49
127
+ - 53
128
+ - 55
129
+ - 63
130
+ - 103
131
+ "2":
132
+ - 1
133
+ - 5
134
+ - 9
135
+ - 11
136
+ - 33
137
+ - 41
138
+ - 49
139
+ - 53
140
+ - 55
141
+ - 63
142
+ - 103
143
+ "3":
144
+ - 7
145
+ - 23
146
+ - 55
147
+ - 66
148
+ "4": 3.10.0
149
+ "5": 0.19.11
150
+ "6": 4.52.3
151
+ "8":
152
+ - 5
153
+ "12": 0.19.11
154
+ "13": linux-x86_64
155
+ accelerator:
156
+ value: gpu
157
+ batch_size:
158
+ value: 168
159
+ bert_hidden_dim:
160
+ value: 768
161
+ bert_name:
162
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
163
+ check_val_every_n_epoch:
164
+ value: 1
165
+ cross_attention_freq:
166
+ value: 2
167
+ devices:
168
+ value: 0,1,2,3,4,5,6,7
169
+ filename:
170
+ value: stage1_06290009_deepspeed
171
+ init_checkpoint:
172
+ value: ""
173
+ init_lr:
174
+ value: 0.0001
175
+ lm:
176
+ value: true
177
+ load_4bit:
178
+ value: false
179
+ lr_decay_rate:
180
+ value: 0.9
181
+ match_batch_size:
182
+ value: 64
183
+ max_epochs:
184
+ value: 20
185
+ min_lr:
186
+ value: 1e-05
187
+ mix_dataset:
188
+ value: true
189
+ mode:
190
+ value: train
191
+ num_query_token:
192
+ value: 8
193
+ num_workers:
194
+ value: 8
195
+ plm_name:
196
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
197
+ plm_tune:
198
+ value: freeze
199
+ pool_size:
200
+ value: 0
201
+ precision:
202
+ value: bf16-mixed
203
+ projection_dim:
204
+ value: 256
205
+ prot_aug:
206
+ value: None
207
+ prot_max_len:
208
+ value: 1024
209
+ ptm:
210
+ value: true
211
+ rerank_cand_num:
212
+ value: 128
213
+ retrieval_eval_epoch:
214
+ value: 10
215
+ root:
216
+ value: data
217
+ save_every_n_epochs:
218
+ value: 5
219
+ scheduler:
220
+ value: linear_warmup_cosine_lr
221
+ seed:
222
+ value: 42
223
+ strategy:
224
+ value: deepspeed
225
+ temperature:
226
+ value: 0.1
227
+ text_max_len:
228
+ value: 128
229
+ use_wandb_logger:
230
+ value: true
231
+ warmup_lr:
232
+ value: 1e-06
233
+ warmup_steps:
234
+ value: 1000
235
+ weight_decay:
236
+ value: 0.05
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Epoch 19: 100%|███████████████████████████████████████████| 320/320 [17:07<00:00, 0.31it/s, v_num=xzqc]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+
19
+ /nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
20
+ sd = self.module.state_dict(destination, prefix, keep_vars)
21
+ `Trainer.fit` stopped: `max_epochs=20` reached.
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ opendatasets==0.1.22
2
+ salesforce-lavis==1.0.2
3
+ Pygments==2.19.1
4
+ nvidia-nccl-cu12==2.21.5
5
+ tornado==6.5.1
6
+ nvidia-cuda-runtime-cu12==12.4.127
7
+ requests==2.32.3
8
+ nvidia-cuda-cupti-cu12==12.4.127
9
+ decord==0.6.0
10
+ braceexpand==0.1.7
11
+ frozenlist==1.6.0
12
+ markdown-it-py==3.0.0
13
+ shellingham==1.5.4
14
+ absl-py==2.2.2
15
+ pycocoevalcap==1.2
16
+ contexttimer==0.3.3
17
+ bleach==6.2.0
18
+ jsonschema-specifications==2025.4.1
19
+ pycocotools==2.0.8
20
+ python-slugify==8.0.4
21
+ tqdm==4.67.1
22
+ numpy==2.2.6
23
+ urllib3==2.4.0
24
+ deepspeed==0.16.10+b666844f
25
+ watchdog==6.0.0
26
+ wrapt==1.17.2
27
+ setuptools==78.1.1
28
+ matplotlib==3.10.3
29
+ pydeck==0.9.1
30
+ aiosignal==1.3.2
31
+ gitdb==4.0.12
32
+ hjson==3.1.0
33
+ timm==0.4.12
34
+ blis==1.3.0
35
+ PyYAML==6.0.2
36
+ referencing==0.36.2
37
+ contourpy==1.3.2
38
+ kaggle==1.7.4.5
39
+ triton==3.2.0
40
+ catalogue==2.0.10
41
+ idna==3.10
42
+ torch==2.6.0
43
+ text-unidecode==1.3
44
+ altair==5.5.0
45
+ cloudpathlib==0.21.1
46
+ protobuf==6.31.0
47
+ nvidia-cusolver-cu12==11.6.1.9
48
+ pytz==2025.2
49
+ sympy==1.13.1
50
+ spacy==3.8.7
51
+ MarkupSafe==3.0.2
52
+ thinc==8.3.6
53
+ nvidia-cudnn-cu12==9.1.0.70
54
+ wasabi==1.1.3
55
+ aiohappyeyeballs==2.6.1
56
+ nvidia-nvtx-cu12==12.4.127
57
+ rich==14.0.0
58
+ ipython==8.36.0
59
+ yarl==1.20.0
60
+ torchmetrics==1.7.1
61
+ multidict==6.4.4
62
+ cfgv==3.4.0
63
+ smmap==5.0.2
64
+ srsly==2.5.1
65
+ scikit-image==0.25.2
66
+ matplotlib-inline==0.1.7
67
+ annotated-types==0.7.0
68
+ lazy_loader==0.4
69
+ tenacity==9.1.2
70
+ GitPython==3.1.44
71
+ language_data==1.3.0
72
+ pydantic_core==2.33.2
73
+ sentencepiece==0.2.0
74
+ platformdirs==4.3.8
75
+ distlib==0.3.9
76
+ nvidia-cusparselt-cu12==0.6.2
77
+ blinker==1.9.0
78
+ regex==2024.11.6
79
+ tifffile==2025.5.10
80
+ py-cpuinfo==9.0.0
81
+ attrs==25.3.0
82
+ mdurl==0.1.2
83
+ prompt_toolkit==3.0.51
84
+ packaging==24.2
85
+ async-timeout==5.0.1
86
+ six==1.17.0
87
+ executing==2.2.0
88
+ parso==0.8.4
89
+ omegaconf==2.3.0
90
+ wcwidth==0.2.13
91
+ murmurhash==1.0.13
92
+ stack-data==0.6.3
93
+ nvidia-cufft-cu12==11.2.1.3
94
+ virtualenv==20.31.2
95
+ langcodes==3.5.0
96
+ fonttools==4.58.0
97
+ opencv-python-headless==4.5.5.64
98
+ jedi==0.19.2
99
+ torchvision==0.21.0
100
+ plotly==6.1.1
101
+ nodeenv==1.9.1
102
+ smart-open==7.1.0
103
+ toml==0.10.2
104
+ pytorch-lightning==2.5.1.post0
105
+ typing_extensions==4.13.2
106
+ safetensors==0.5.3
107
+ psutil==7.0.0
108
+ pillow==11.2.1
109
+ python-dateutil==2.9.0.post0
110
+ ftfy==6.3.1
111
+ scipy==1.15.3
112
+ webdataset==0.2.111
113
+ charset-normalizer==3.4.2
114
+ nvidia-nvjitlink-cu12==12.4.127
115
+ kiwisolver==1.4.8
116
+ nvidia-ml-py==12.575.51
117
+ confection==0.1.5
118
+ nvidia-curand-cu12==10.3.5.147
119
+ pandas==2.2.3
120
+ nltk==3.9.1
121
+ webencodings==0.5.1
122
+ pyarrow==20.0.0
123
+ asttokens==3.0.0
124
+ exceptiongroup==1.3.0
125
+ pre_commit==4.2.0
126
+ ninja==1.11.1.4
127
+ spacy-loggers==1.0.5
128
+ msgpack==1.1.0
129
+ lightning-utilities==0.14.3
130
+ nvidia-cublas-cu12==12.4.5.8
131
+ tzdata==2025.2
132
+ cycler==0.12.1
133
+ hf-xet==1.1.2
134
+ antlr4-python3-runtime==4.9.3
135
+ iopath==0.1.10
136
+ pexpect==4.9.0
137
+ imageio==2.37.0
138
+ streamlit==1.45.1
139
+ python-magic==0.4.27
140
+ networkx==3.4.2
141
+ portalocker==3.1.1
142
+ nvidia-cusparse-cu12==12.3.1.170
143
+ propcache==0.3.1
144
+ ptyprocess==0.7.0
145
+ fairscale==0.4.4
146
+ rpds-py==0.25.1
147
+ certifi==2025.4.26
148
+ rouge_score==0.1.2
149
+ traitlets==5.14.3
150
+ identify==2.6.12
151
+ spacy-legacy==3.0.12
152
+ weasel==0.4.1
153
+ mpmath==1.3.0
154
+ cymem==2.0.11
155
+ typing-inspection==0.4.1
156
+ nvidia-cuda-nvrtc-cu12==12.4.127
157
+ marisa-trie==1.2.1
158
+ einops==0.8.1
159
+ nvidia-cufile-cu12==1.11.1.6
160
+ pydantic==2.11.5
161
+ cachetools==5.5.2
162
+ joblib==1.5.1
163
+ Jinja2==3.1.6
164
+ filelock==3.18.0
165
+ pyparsing==3.2.3
166
+ pure_eval==0.2.3
167
+ decorator==5.2.1
168
+ wheel==0.45.1
169
+ pycryptodome==3.23.0
170
+ cheroot==10.0.1
171
+ multiprocess==0.70.16
172
+ aiohttp==3.12.2
173
+ crcmod==1.7
174
+ fsspec==2025.3.0
175
+ jmespath==0.10.0
176
+ preshed==3.0.10
177
+ jaraco.functools==4.1.0
178
+ cryptography==45.0.3
179
+ sentry-sdk==2.29.1
180
+ tokenizers==0.21.1
181
+ opendelta==0.3.2
182
+ pycparser==2.22
183
+ narwhals==1.41.0
184
+ scikit-learn==1.6.1
185
+ dill==0.3.8
186
+ oss2==2.15.0
187
+ yacs==0.1.8
188
+ more-itertools==10.7.0
189
+ pip==25.1.1
190
+ threadpoolctl==3.6.0
191
+ flash-attn==2.7.1.post1
192
+ bigmodelvis==0.0.1
193
+ pathlib==1.0.1
194
+ delta-center-client==0.0.4
195
+ xxhash==3.5.0
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ aliyun-python-sdk-core==2.16.0
199
+ transformers==4.52.3
200
+ aliyun-python-sdk-kms==2.16.5
201
+ datasets==3.6.0
202
+ typer==0.16.0
203
+ docker-pycreds==0.4.0
204
+ click==8.2.1
205
+ huggingface-hub==0.32.1
206
+ web.py==0.62
207
+ cffi==1.17.1
208
+ opencv-python==4.11.0.86
209
+ jsonschema==4.24.0
210
+ typing_extensions==4.12.2
211
+ jaraco.functools==4.0.1
212
+ jaraco.text==3.12.1
213
+ jaraco.collections==5.1.0
214
+ inflect==7.3.1
215
+ more-itertools==10.3.0
216
+ packaging==24.2
217
+ importlib_metadata==8.0.0
218
+ backports.tarfile==1.2.0
219
+ typeguard==4.3.0
220
+ zipp==3.19.2
221
+ platformdirs==4.2.2
222
+ autocommand==2.2.2
223
+ jaraco.context==5.3.0
224
+ tomli==2.0.1
225
+ wheel==0.45.1
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T16:12:00.926076Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06290009_deepspeed",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "168",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "root": "./all_checkpoints/stage1_06290009_deepspeed/",
34
+ "host": "dsw-251511-c65bb988c-9g24f",
35
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
36
+ "codePathLocal": "stage1.py",
37
+ "cpu_count": 64,
38
+ "cpu_count_logical": 64,
39
+ "gpu": "NVIDIA A800-SXM4-80GB",
40
+ "gpu_count": 8,
41
+ "disk": {
42
+ "/": {
43
+ "total": "1623302262784",
44
+ "used": "987680768"
45
+ }
46
+ },
47
+ "memory": {
48
+ "total": "549755813888"
49
+ },
50
+ "cpu": {
51
+ "count": 64,
52
+ "countLogical": 64
53
+ },
54
+ "gpu_nvidia": [
55
+ {
56
+ "name": "NVIDIA A800-SXM4-80GB",
57
+ "memoryTotal": "85198045184",
58
+ "architecture": "Ampere"
59
+ },
60
+ {
61
+ "name": "NVIDIA A800-SXM4-80GB",
62
+ "memoryTotal": "85198045184",
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA A800-SXM4-80GB",
67
+ "memoryTotal": "85198045184",
68
+ "architecture": "Ampere"
69
+ },
70
+ {
71
+ "name": "NVIDIA A800-SXM4-80GB",
72
+ "memoryTotal": "85198045184",
73
+ "architecture": "Ampere"
74
+ },
75
+ {
76
+ "name": "NVIDIA A800-SXM4-80GB",
77
+ "memoryTotal": "85198045184",
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A800-SXM4-80GB",
82
+ "memoryTotal": "85198045184",
83
+ "architecture": "Ampere"
84
+ },
85
+ {
86
+ "name": "NVIDIA A800-SXM4-80GB",
87
+ "memoryTotal": "85198045184",
88
+ "architecture": "Ampere"
89
+ },
90
+ {
91
+ "name": "NVIDIA A800-SXM4-80GB",
92
+ "memoryTotal": "85198045184",
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "12.1"
97
+ }
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"loader1/val_loss_lm/dataloader_idx_1":5.460818767547607,"lr":1.0554024811426643e-05,"_timestamp":1.7511476307831557e+09,"loader1/val_loss/dataloader_idx_1":14.773687362670898,"train_loss":2.630859375,"loader2/val_loss_ptm/dataloader_idx_2":2.988206148147583,"loader1/val_loss_ptm/dataloader_idx_1":3.491912603378296,"loader2/val_loss_lm/dataloader_idx_2":6.094737529754639,"_runtime":20509.857471191,"loader2/val_loss_ptc/dataloader_idx_2":5.59119987487793,"_wandb":{"runtime":20517},"train_loss_ptc":0.64306640625,"train_loss_ptm":0.291015625,"loader0/val_loss_ptc/dataloader_idx_0":1.0085703134536743,"loader1/val_loss_ptc/dataloader_idx_1":5.825062274932861,"loader0/val_loss_lm/dataloader_idx_0":2.1440062522888184,"loader2/val_loss/dataloader_idx_2":14.674175262451172,"epoch":19,"train_loss_lm":1.697265625,"_step":147,"loader0/val_loss/dataloader_idx_0":3.6699562072753906,"loader0/val_loss_ptm/dataloader_idx_0":0.5172882676124573,"trainer/global_step":6399}
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
3
+ {"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
4
+ {"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
5
+ {"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
6
+ {"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
7
+ {"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
8
+ {"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
10
+ {"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
11
+ {"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
14
+ {"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
16
+ {"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
17
+ {"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
18
+ {"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
20
+ {"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
22
+ {"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
24
+ {"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
27
+ {"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
28
+ {"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
29
+ {"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
30
+ {"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
33
+ {"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
34
+ {"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
35
+ {"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
36
+ {"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
37
+ {"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
38
+ {"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
39
+ {"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
40
+ {"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
41
+ {"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
42
+ {"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
3
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
7
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
8
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b9bae29f7eff7d1456d46d7a5fc028ecfa83d57215ed5ead79c22a39602cb8
3
+ size 7831005
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eef5f0c622ec387ea2170b4d959435d6318f6de5a9cc1feb0516c58c1a678732
3
+ size 269461360
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db4bea839400c6bc8cdb4aeecb922ce12df2e2d61dd5044829ecbda842413c2a
3
+ size 269466288
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d343b92ad55cb228b7571bad35ee7aaebdbedbcfe4160f83250484719333b3
3
+ size 269466352
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f3393bc0e7b8baae8bec392cd9d8599815f083eb9c2bc478332a65f1aff728
3
+ size 269465648
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24b3d97ae95d888cfdd074705605bf04315162a2c87780e94c7d6fedbca9b49
3
+ size 269466096
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:208d3f152953084a010524ab98240fdece461a004a016925d05ac6e3f92847a6
3
+ size 269466352
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8860a1be45ca900d2da827852856116e0a548ce82432d94210d4713cbbd5acc4
3
+ size 269466096
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d33924194e0954a8e36ef63297fdc2b9754a2e9f368e8f302cab40552b8d5f20
3
+ size 269465904
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ded329a604392d5b75d24097eff999add88eee739da2b384cb7b6a8b5c311a
3
+ size 359448824
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/latest ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
3
+ size 10
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
3
+ size 33272
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194d691a6ab90298daf6d5eaa20e3f9a8164e46912823a9fef49bf8b38aad782
3
+ size 269461360
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c4f598e2f0e66dbdc116fd5ca5e5311fe8b80a8f685e59a9a09d376d8b1666d
3
+ size 269466288
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79bb71cb535f9037a708d5df0bdd2f1f7503f2863210a6e70d95e82e1252efd5
3
+ size 269466352
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69968693b4d1f1dc91388c21530f05303a247181a06837f0d1f7102aa1d08569
3
+ size 269465648
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45daf7ab82636399b71a0e7f8e8f1e713eecccbc1b02231c42c37b2827efa557
3
+ size 269466096
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3933fad3f64b1ec08005ff3f4b41707aec3bc8ad131e058fc01f7a7011b5063c
3
+ size 269466352
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1972613b6bded397cb64a5cbfe6da779c1809df0dc8c91fff5d8a75aa82384b7
3
+ size 269466096
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19d448bb7539c16d523eaefd5f6417c700aed03f1a6c8395c99514462f86ee1e
3
+ size 269465904
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b212a5b5020f57247311f70f59e51d7f724cad099891b5d172b38506ecc64642
3
+ size 359448824
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23721c244e955e6c1b8ef31b4e95492811f6b9bb605d28345b5cae3095393a5b
3
+ size 718651684
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/latest ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
3
+ size 10
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
3
+ size 33272