Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- .gitattributes +3 -0
- =0.12.10 +33 -0
- README.md +105 -0
- all_checkpoints/.gitignore +2 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/latest +3 -0
- all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/zero_to_fp32.py +3 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log +42 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log +24 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml +236 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log +21 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt +225 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json +97 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json +1 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log +42 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log +24 -0
- all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/latest +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/zero_to_fp32.py +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/latest +3 -0
- all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/zero_to_fp32.py +3 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
|
=0.12.10
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
|
| 2 |
+
Collecting wandb
|
| 3 |
+
Downloading https://mirrors.aliyun.com/pypi/packages/f9/31/eeb2878b26566c04c3e9b8b20b3ec3c54a2be50535088d36a37c008e07a3/wandb-0.19.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.4 MB)
|
| 4 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.4/21.4 MB 6.2 MB/s eta 0:00:00
|
| 5 |
+
Requirement already satisfied: click!=8.0.0,>=7.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (8.2.1)
|
| 6 |
+
Collecting docker-pycreds>=0.4.0 (from wandb)
|
| 7 |
+
Downloading https://mirrors.aliyun.com/pypi/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
|
| 8 |
+
Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (3.1.44)
|
| 9 |
+
Requirement already satisfied: platformdirs in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (4.3.8)
|
| 10 |
+
Requirement already satisfied: protobuf!=4.21.0,!=5.28.0,<7,>=3.19.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (6.31.0)
|
| 11 |
+
Requirement already satisfied: psutil>=5.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (7.0.0)
|
| 12 |
+
Requirement already satisfied: pydantic<3 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (2.11.5)
|
| 13 |
+
Requirement already satisfied: pyyaml in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (6.0.2)
|
| 14 |
+
Requirement already satisfied: requests<3,>=2.0.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (2.32.3)
|
| 15 |
+
Collecting sentry-sdk>=2.0.0 (from wandb)
|
| 16 |
+
Downloading https://mirrors.aliyun.com/pypi/packages/f0/e5/da07b0bd832cefd52d16f2b9bbbe31624d57552602c06631686b93ccb1bd/sentry_sdk-2.29.1-py2.py3-none-any.whl (341 kB)
|
| 17 |
+
Collecting setproctitle (from wandb)
|
| 18 |
+
Downloading https://mirrors.aliyun.com/pypi/packages/67/2b/c3cbd4a4462c1143465d8c151f1d51bbfb418e60a96a754329d28d416575/setproctitle-1.3.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
|
| 19 |
+
Requirement already satisfied: setuptools in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (78.1.1)
|
| 20 |
+
Requirement already satisfied: typing-extensions<5,>=4.4 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from wandb) (4.13.2)
|
| 21 |
+
Requirement already satisfied: annotated-types>=0.6.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (0.7.0)
|
| 22 |
+
Requirement already satisfied: pydantic-core==2.33.2 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (2.33.2)
|
| 23 |
+
Requirement already satisfied: typing-inspection>=0.4.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from pydantic<3->wandb) (0.4.1)
|
| 24 |
+
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.4.2)
|
| 25 |
+
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (3.10)
|
| 26 |
+
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2.4.0)
|
| 27 |
+
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from requests<3,>=2.0.0->wandb) (2025.4.26)
|
| 28 |
+
Requirement already satisfied: six>=1.4.0 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from docker-pycreds>=0.4.0->wandb) (1.17.0)
|
| 29 |
+
Requirement already satisfied: gitdb<5,>=4.0.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from gitpython!=3.1.29,>=1.0.0->wandb) (4.0.12)
|
| 30 |
+
Requirement already satisfied: smmap<6,>=3.0.1 in /root/miniconda3/envs/protT3/lib/python3.10/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb) (5.0.2)
|
| 31 |
+
Installing collected packages: setproctitle, sentry-sdk, docker-pycreds, wandb
|
| 32 |
+
|
| 33 |
+
Successfully installed docker-pycreds-0.4.0 sentry-sdk-2.29.1 setproctitle-1.3.6 wandb-0.19.11
|
README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ProtT3: Protein-to-Text Generation for Text-based Protein Understanding
|
| 2 |
+
|
| 3 |
+
Code for our ACL 2024 paper.
|
| 4 |
+
|
| 5 |
+
Authors: Zhiyuan Liu, An Zhang, Hao Fei, Enzhi Zhang, Xiang Wang, Kenji Kawaguchi, Tat-Seng Chua
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
## Dependencies
|
| 9 |
+
|
| 10 |
+
python==3.8
|
| 11 |
+
|
| 12 |
+
* Install PyTorch with cuda-11.7 using conda by following the instructions in [link](https://pytorch.org/get-started/locally/)
|
| 13 |
+
* Install flash-attention by running `pip install flash-attn --no-build-isolation`. You might need to install the following dependencies first, for building the flash-attention module:
|
| 14 |
+
* `pip install packaging ninja`
|
| 15 |
+
* `conda install -c "nvidia/label/cuda-11.7.1" cuda-nvcc`
|
| 16 |
+
* `conda install -c "nvidia/label/cuda-11.7.1" cuda-libraries-dev`
|
| 17 |
+
* Install the latest version of OpenDelta by running `pip install git+https://github.com/thunlp/OpenDelta.git`
|
| 18 |
+
* Install Lavis: `pip install rouge_score nltk salesforce-lavis`
|
| 19 |
+
* Install others: `pip install -U transformers pytorch-lightning`
|
| 20 |
+
* Install the latest version of DeepSpeed: `pip install git+https://github.com/microsoft/DeepSpeed.git`
|
| 21 |
+
* Download nltk corpus:
|
| 22 |
+
```
|
| 23 |
+
import nltk
|
| 24 |
+
nltk.download('wordnet')
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Dataset
|
| 28 |
+
|
| 29 |
+
Download our pre-processed datasets from [link](https://osf.io/23azs/?view_only=185575515e714f4798499bf06513a730), and unzip the datasets under the `./data` directory
|
| 30 |
+
|
| 31 |
+
## Reproduce results by training from scratch
|
| 32 |
+
|
| 33 |
+
* Reproduce results in stage 1:
|
| 34 |
+
|
| 35 |
+
```sh
|
| 36 |
+
python stage1.py --devices '0,1,2,3' --mode train --filename stage1_ckpt --num_query_token 8 --plm_name "facebook/esm2_t30_150M_UR50D" --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
* Convert stage1's DeepSpeed checkpoint to PyTorch format by running
|
| 40 |
+
|
| 41 |
+
```sh
|
| 42 |
+
python convert.py --input /path/to/stage1/ckpt/address --output /path/to/ckpt/saving/address
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
* Reproduce results in stage 2:
|
| 46 |
+
|
| 47 |
+
* Protein Captioning:
|
| 48 |
+
|
| 49 |
+
```sh
|
| 50 |
+
python stage2.py --devices '0,1,2,3' --mode train --filename protein_captioning_swiss_dataset --num_query_token 8 --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --llm_tune mid_lora --enable_flash --root './data/SwissProtV3' --stage1_path /path/to/ckpt/saving/address;
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
* Protein Question-Answering:
|
| 54 |
+
|
| 55 |
+
```sh
|
| 56 |
+
python stage2.py --devices '0,1,2,3' --mode train --filename prot_qa --num_query_token 8 --save_every_n_epochs 10 --num_workers 8 --batch_size 128 --accumulate_grad_batches 1 --precision 'bf16-mixed' --root "data/PDBDataset" --llm_tune mid_lora --prompt "Question: {} Answer:" --inference_batch 32 --max_inference_len 36 --stage1_path /path/to/ckpt/saving/address;
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
* After running one of the two scripts above, the model's protein-to-text generation results will be saved at `./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt`. You can evaluate the results by running
|
| 60 |
+
|
| 61 |
+
```sh
|
| 62 |
+
## for question-answering evaluation
|
| 63 |
+
python read_results --path ./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt --qa_question
|
| 64 |
+
|
| 65 |
+
## for protein captioning evaluation
|
| 66 |
+
python read_results --path ./all_checkpoint/[filename]/lightning_logs/[version_x]/dataset0_predictions.txt
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## Reproduce results by loading our checkpoints
|
| 70 |
+
|
| 71 |
+
Download our released checkpoints from [link](https://osf.io/23azs/?view_only=185575515e714f4798499bf06513a730)
|
| 72 |
+
|
| 73 |
+
* Reproduce results in stage 1:
|
| 74 |
+
|
| 75 |
+
```sh
|
| 76 |
+
python stage1.py --devices '0,1,2,3' --mode eval --filename stage1_ckpt --num_query_token 8 --plm_name "facebook/esm2_t30_150M_UR50D" --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --init_checkpoint /path/to/stage1.ckpt;
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
* Reproduce results in stage 2:
|
| 80 |
+
|
| 81 |
+
* Protein Captioning:
|
| 82 |
+
|
| 83 |
+
```sh
|
| 84 |
+
python stage2.py --devices '0,1,2,3' --mode train --filename protein_captioning_swiss_dataset --num_query_token 8 --save_every_n_epochs 10 --batch_size 32 --precision 'bf16-mixed' --num_workers 8 --llm_tune mid_lora --enable_flash --root './data/SwissProtV3' --init_checkpoint /path/to/swiss_ft.ckpt;
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
* Protein Question-Answering:
|
| 88 |
+
|
| 89 |
+
```sh
|
| 90 |
+
python stage2.py --devices '0,1,2,3' --mode train --filename prot_qa --num_query_token 8 --save_every_n_epochs 10 --num_workers 8 --batch_size 128 --accumulate_grad_batches 1 --precision 'bf16-mixed' --root "data/PDBDataset" --llm_tune mid_lora --prompt "Question: {} Answer:" --inference_batch 32 --max_inference_len 36 --init_checkpoint /path/to/pdbqa_ft.ckpt;
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
## Citation
|
| 95 |
+
|
| 96 |
+
```bib
|
| 97 |
+
@inproceedings{liu2024prott,
|
| 98 |
+
title={ProtT3: Protein-to-Text Generation for Text-based Protein Understanding},
|
| 99 |
+
author={Liu, Zhiyuan and Zhang, An and Fei, Hao and Zhang, Enzhi and Wang, Xiang and Kawaguchi, Kenji and Chua, Tat-Seng},
|
| 100 |
+
booktitle={{ACL}},
|
| 101 |
+
publisher = {Association for Computational Linguistics},
|
| 102 |
+
year={2024},
|
| 103 |
+
url={https://openreview.net/forum?id=ZmIjOPil2b}
|
| 104 |
+
}
|
| 105 |
+
```
|
all_checkpoints/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*
|
| 2 |
+
!.gitignore
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1d6798603170f2f7903bb9a0d21a5ed25e860528296107c8b31cd2914a86965
|
| 3 |
+
size 269461360
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ab0580d136844b18a0e388946d57dca52ab782052d7885d0c033a07708b3957
|
| 3 |
+
size 269466288
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e71ea3d389287f043d6acb2922ee075f8ceaf30ac73b145c5cb709ca3a4cfea1
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bd10e7db6ec37fcbdb2baa6f51691e815bde2e5a112b516675fb556c4674e87
|
| 3 |
+
size 269465648
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b5975d03715bc1cc0eb9de733fbb4475c0540e790c6f00d0aae625e8ba08230
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61d27b7a3de97f24ef54d58dc7e2a57d2568ed520177746f47540c3f4d26a847
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08bcb9b3d8e2b0e8e71b925b4c0ef0da6702e9609c252dc90607d9d4e3801b4c
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d426f595bbbde8b5a76ead76e06cee6febe35c7cf6edcf40aaf06560d47100e
|
| 3 |
+
size 269465904
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2691cb5de812f5198012ce9c319ee4faae20c95a97a5639b3796d13561d44b49
|
| 3 |
+
size 359448824
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aabff8ac8cffa262ba0de81d44b32342f1fce2f1b6efe66064bb9d99b3ef01bd
|
| 3 |
+
size 718651620
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/latest
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
|
| 3 |
+
size 10
|
all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
|
| 3 |
+
size 33272
|
all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 3 |
+
{"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
|
| 4 |
+
{"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
|
| 5 |
+
{"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
|
| 6 |
+
{"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
|
| 7 |
+
{"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
|
| 8 |
+
{"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 9 |
+
{"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
|
| 10 |
+
{"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
|
| 17 |
+
{"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
|
| 23 |
+
{"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
|
| 24 |
+
{"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
|
| 25 |
+
{"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
|
| 28 |
+
{"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
|
| 29 |
+
{"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
|
| 30 |
+
{"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 34 |
+
{"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
|
| 35 |
+
{"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
|
| 36 |
+
{"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 37 |
+
{"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 38 |
+
{"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 39 |
+
{"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
|
| 40 |
+
{"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
|
| 41 |
+
{"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
|
| 42 |
+
{"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
|
all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
|
| 3 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
|
| 7 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
- "1": train_loss
|
| 10 |
+
"5": 1
|
| 11 |
+
"6":
|
| 12 |
+
- 1
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": loader1/val_loss/dataloader_idx_1
|
| 16 |
+
"5": 1
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": loader0/val_loss_lm/dataloader_idx_0
|
| 22 |
+
"5": 1
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": loader1/val_loss_ptm/dataloader_idx_1
|
| 28 |
+
"5": 1
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": loader2/val_loss_lm/dataloader_idx_2
|
| 34 |
+
"5": 1
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": loader1/val_loss_ptc/dataloader_idx_1
|
| 40 |
+
"5": 1
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": epoch
|
| 46 |
+
"5": 1
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": lr
|
| 52 |
+
"5": 1
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": loader2/val_loss_ptc/dataloader_idx_2
|
| 58 |
+
"5": 1
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": loader0/val_loss_ptm/dataloader_idx_0
|
| 64 |
+
"5": 1
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": train_loss_ptc
|
| 70 |
+
"5": 1
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": train_loss_ptm
|
| 76 |
+
"5": 1
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
- "1": train_loss_lm
|
| 82 |
+
"5": 1
|
| 83 |
+
"6":
|
| 84 |
+
- 1
|
| 85 |
+
- 3
|
| 86 |
+
"7": []
|
| 87 |
+
- "1": loader2/val_loss_ptm/dataloader_idx_2
|
| 88 |
+
"5": 1
|
| 89 |
+
"6":
|
| 90 |
+
- 1
|
| 91 |
+
- 3
|
| 92 |
+
"7": []
|
| 93 |
+
- "1": loader0/val_loss/dataloader_idx_0
|
| 94 |
+
"5": 1
|
| 95 |
+
"6":
|
| 96 |
+
- 1
|
| 97 |
+
- 3
|
| 98 |
+
"7": []
|
| 99 |
+
- "1": loader2/val_loss/dataloader_idx_2
|
| 100 |
+
"5": 1
|
| 101 |
+
"6":
|
| 102 |
+
- 1
|
| 103 |
+
- 3
|
| 104 |
+
"7": []
|
| 105 |
+
- "1": loader1/val_loss_lm/dataloader_idx_1
|
| 106 |
+
"5": 1
|
| 107 |
+
"6":
|
| 108 |
+
- 1
|
| 109 |
+
- 3
|
| 110 |
+
"7": []
|
| 111 |
+
- "1": loader0/val_loss_ptc/dataloader_idx_0
|
| 112 |
+
"5": 1
|
| 113 |
+
"6":
|
| 114 |
+
- 1
|
| 115 |
+
- 3
|
| 116 |
+
"7": []
|
| 117 |
+
python_version: 3.10.0
|
| 118 |
+
t:
|
| 119 |
+
"1":
|
| 120 |
+
- 1
|
| 121 |
+
- 5
|
| 122 |
+
- 9
|
| 123 |
+
- 11
|
| 124 |
+
- 33
|
| 125 |
+
- 41
|
| 126 |
+
- 49
|
| 127 |
+
- 53
|
| 128 |
+
- 55
|
| 129 |
+
- 63
|
| 130 |
+
- 103
|
| 131 |
+
"2":
|
| 132 |
+
- 1
|
| 133 |
+
- 5
|
| 134 |
+
- 9
|
| 135 |
+
- 11
|
| 136 |
+
- 33
|
| 137 |
+
- 41
|
| 138 |
+
- 49
|
| 139 |
+
- 53
|
| 140 |
+
- 55
|
| 141 |
+
- 63
|
| 142 |
+
- 103
|
| 143 |
+
"3":
|
| 144 |
+
- 7
|
| 145 |
+
- 23
|
| 146 |
+
- 55
|
| 147 |
+
- 66
|
| 148 |
+
"4": 3.10.0
|
| 149 |
+
"5": 0.19.11
|
| 150 |
+
"6": 4.52.3
|
| 151 |
+
"8":
|
| 152 |
+
- 5
|
| 153 |
+
"12": 0.19.11
|
| 154 |
+
"13": linux-x86_64
|
| 155 |
+
accelerator:
|
| 156 |
+
value: gpu
|
| 157 |
+
batch_size:
|
| 158 |
+
value: 168
|
| 159 |
+
bert_hidden_dim:
|
| 160 |
+
value: 768
|
| 161 |
+
bert_name:
|
| 162 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 163 |
+
check_val_every_n_epoch:
|
| 164 |
+
value: 1
|
| 165 |
+
cross_attention_freq:
|
| 166 |
+
value: 2
|
| 167 |
+
devices:
|
| 168 |
+
value: 0,1,2,3,4,5,6,7
|
| 169 |
+
filename:
|
| 170 |
+
value: stage1_06290009_deepspeed
|
| 171 |
+
init_checkpoint:
|
| 172 |
+
value: ""
|
| 173 |
+
init_lr:
|
| 174 |
+
value: 0.0001
|
| 175 |
+
lm:
|
| 176 |
+
value: true
|
| 177 |
+
load_4bit:
|
| 178 |
+
value: false
|
| 179 |
+
lr_decay_rate:
|
| 180 |
+
value: 0.9
|
| 181 |
+
match_batch_size:
|
| 182 |
+
value: 64
|
| 183 |
+
max_epochs:
|
| 184 |
+
value: 20
|
| 185 |
+
min_lr:
|
| 186 |
+
value: 1e-05
|
| 187 |
+
mix_dataset:
|
| 188 |
+
value: true
|
| 189 |
+
mode:
|
| 190 |
+
value: train
|
| 191 |
+
num_query_token:
|
| 192 |
+
value: 8
|
| 193 |
+
num_workers:
|
| 194 |
+
value: 8
|
| 195 |
+
plm_name:
|
| 196 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 197 |
+
plm_tune:
|
| 198 |
+
value: freeze
|
| 199 |
+
pool_size:
|
| 200 |
+
value: 0
|
| 201 |
+
precision:
|
| 202 |
+
value: bf16-mixed
|
| 203 |
+
projection_dim:
|
| 204 |
+
value: 256
|
| 205 |
+
prot_aug:
|
| 206 |
+
value: None
|
| 207 |
+
prot_max_len:
|
| 208 |
+
value: 1024
|
| 209 |
+
ptm:
|
| 210 |
+
value: true
|
| 211 |
+
rerank_cand_num:
|
| 212 |
+
value: 128
|
| 213 |
+
retrieval_eval_epoch:
|
| 214 |
+
value: 10
|
| 215 |
+
root:
|
| 216 |
+
value: data
|
| 217 |
+
save_every_n_epochs:
|
| 218 |
+
value: 5
|
| 219 |
+
scheduler:
|
| 220 |
+
value: linear_warmup_cosine_lr
|
| 221 |
+
seed:
|
| 222 |
+
value: 42
|
| 223 |
+
strategy:
|
| 224 |
+
value: deepspeed
|
| 225 |
+
temperature:
|
| 226 |
+
value: 0.1
|
| 227 |
+
text_max_len:
|
| 228 |
+
value: 128
|
| 229 |
+
use_wandb_logger:
|
| 230 |
+
value: true
|
| 231 |
+
warmup_lr:
|
| 232 |
+
value: 1e-06
|
| 233 |
+
warmup_steps:
|
| 234 |
+
value: 1000
|
| 235 |
+
weight_decay:
|
| 236 |
+
value: 0.05
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Epoch 19: 100%|███████████████████████████████████████████| 320/320 [17:07<00:00, 0.31it/s, v_num=xzqc]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
|
| 19 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
| 20 |
+
sd = self.module.state_dict(destination, prefix, keep_vars)
|
| 21 |
+
`Trainer.fit` stopped: `max_epochs=20` reached.
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
opendatasets==0.1.22
|
| 2 |
+
salesforce-lavis==1.0.2
|
| 3 |
+
Pygments==2.19.1
|
| 4 |
+
nvidia-nccl-cu12==2.21.5
|
| 5 |
+
tornado==6.5.1
|
| 6 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 7 |
+
requests==2.32.3
|
| 8 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
braceexpand==0.1.7
|
| 11 |
+
frozenlist==1.6.0
|
| 12 |
+
markdown-it-py==3.0.0
|
| 13 |
+
shellingham==1.5.4
|
| 14 |
+
absl-py==2.2.2
|
| 15 |
+
pycocoevalcap==1.2
|
| 16 |
+
contexttimer==0.3.3
|
| 17 |
+
bleach==6.2.0
|
| 18 |
+
jsonschema-specifications==2025.4.1
|
| 19 |
+
pycocotools==2.0.8
|
| 20 |
+
python-slugify==8.0.4
|
| 21 |
+
tqdm==4.67.1
|
| 22 |
+
numpy==2.2.6
|
| 23 |
+
urllib3==2.4.0
|
| 24 |
+
deepspeed==0.16.10+b666844f
|
| 25 |
+
watchdog==6.0.0
|
| 26 |
+
wrapt==1.17.2
|
| 27 |
+
setuptools==78.1.1
|
| 28 |
+
matplotlib==3.10.3
|
| 29 |
+
pydeck==0.9.1
|
| 30 |
+
aiosignal==1.3.2
|
| 31 |
+
gitdb==4.0.12
|
| 32 |
+
hjson==3.1.0
|
| 33 |
+
timm==0.4.12
|
| 34 |
+
blis==1.3.0
|
| 35 |
+
PyYAML==6.0.2
|
| 36 |
+
referencing==0.36.2
|
| 37 |
+
contourpy==1.3.2
|
| 38 |
+
kaggle==1.7.4.5
|
| 39 |
+
triton==3.2.0
|
| 40 |
+
catalogue==2.0.10
|
| 41 |
+
idna==3.10
|
| 42 |
+
torch==2.6.0
|
| 43 |
+
text-unidecode==1.3
|
| 44 |
+
altair==5.5.0
|
| 45 |
+
cloudpathlib==0.21.1
|
| 46 |
+
protobuf==6.31.0
|
| 47 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 48 |
+
pytz==2025.2
|
| 49 |
+
sympy==1.13.1
|
| 50 |
+
spacy==3.8.7
|
| 51 |
+
MarkupSafe==3.0.2
|
| 52 |
+
thinc==8.3.6
|
| 53 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 54 |
+
wasabi==1.1.3
|
| 55 |
+
aiohappyeyeballs==2.6.1
|
| 56 |
+
nvidia-nvtx-cu12==12.4.127
|
| 57 |
+
rich==14.0.0
|
| 58 |
+
ipython==8.36.0
|
| 59 |
+
yarl==1.20.0
|
| 60 |
+
torchmetrics==1.7.1
|
| 61 |
+
multidict==6.4.4
|
| 62 |
+
cfgv==3.4.0
|
| 63 |
+
smmap==5.0.2
|
| 64 |
+
srsly==2.5.1
|
| 65 |
+
scikit-image==0.25.2
|
| 66 |
+
matplotlib-inline==0.1.7
|
| 67 |
+
annotated-types==0.7.0
|
| 68 |
+
lazy_loader==0.4
|
| 69 |
+
tenacity==9.1.2
|
| 70 |
+
GitPython==3.1.44
|
| 71 |
+
language_data==1.3.0
|
| 72 |
+
pydantic_core==2.33.2
|
| 73 |
+
sentencepiece==0.2.0
|
| 74 |
+
platformdirs==4.3.8
|
| 75 |
+
distlib==0.3.9
|
| 76 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 77 |
+
blinker==1.9.0
|
| 78 |
+
regex==2024.11.6
|
| 79 |
+
tifffile==2025.5.10
|
| 80 |
+
py-cpuinfo==9.0.0
|
| 81 |
+
attrs==25.3.0
|
| 82 |
+
mdurl==0.1.2
|
| 83 |
+
prompt_toolkit==3.0.51
|
| 84 |
+
packaging==24.2
|
| 85 |
+
async-timeout==5.0.1
|
| 86 |
+
six==1.17.0
|
| 87 |
+
executing==2.2.0
|
| 88 |
+
parso==0.8.4
|
| 89 |
+
omegaconf==2.3.0
|
| 90 |
+
wcwidth==0.2.13
|
| 91 |
+
murmurhash==1.0.13
|
| 92 |
+
stack-data==0.6.3
|
| 93 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 94 |
+
virtualenv==20.31.2
|
| 95 |
+
langcodes==3.5.0
|
| 96 |
+
fonttools==4.58.0
|
| 97 |
+
opencv-python-headless==4.5.5.64
|
| 98 |
+
jedi==0.19.2
|
| 99 |
+
torchvision==0.21.0
|
| 100 |
+
plotly==6.1.1
|
| 101 |
+
nodeenv==1.9.1
|
| 102 |
+
smart-open==7.1.0
|
| 103 |
+
toml==0.10.2
|
| 104 |
+
pytorch-lightning==2.5.1.post0
|
| 105 |
+
typing_extensions==4.13.2
|
| 106 |
+
safetensors==0.5.3
|
| 107 |
+
psutil==7.0.0
|
| 108 |
+
pillow==11.2.1
|
| 109 |
+
python-dateutil==2.9.0.post0
|
| 110 |
+
ftfy==6.3.1
|
| 111 |
+
scipy==1.15.3
|
| 112 |
+
webdataset==0.2.111
|
| 113 |
+
charset-normalizer==3.4.2
|
| 114 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 115 |
+
kiwisolver==1.4.8
|
| 116 |
+
nvidia-ml-py==12.575.51
|
| 117 |
+
confection==0.1.5
|
| 118 |
+
nvidia-curand-cu12==10.3.5.147
|
| 119 |
+
pandas==2.2.3
|
| 120 |
+
nltk==3.9.1
|
| 121 |
+
webencodings==0.5.1
|
| 122 |
+
pyarrow==20.0.0
|
| 123 |
+
asttokens==3.0.0
|
| 124 |
+
exceptiongroup==1.3.0
|
| 125 |
+
pre_commit==4.2.0
|
| 126 |
+
ninja==1.11.1.4
|
| 127 |
+
spacy-loggers==1.0.5
|
| 128 |
+
msgpack==1.1.0
|
| 129 |
+
lightning-utilities==0.14.3
|
| 130 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 131 |
+
tzdata==2025.2
|
| 132 |
+
cycler==0.12.1
|
| 133 |
+
hf-xet==1.1.2
|
| 134 |
+
antlr4-python3-runtime==4.9.3
|
| 135 |
+
iopath==0.1.10
|
| 136 |
+
pexpect==4.9.0
|
| 137 |
+
imageio==2.37.0
|
| 138 |
+
streamlit==1.45.1
|
| 139 |
+
python-magic==0.4.27
|
| 140 |
+
networkx==3.4.2
|
| 141 |
+
portalocker==3.1.1
|
| 142 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 143 |
+
propcache==0.3.1
|
| 144 |
+
ptyprocess==0.7.0
|
| 145 |
+
fairscale==0.4.4
|
| 146 |
+
rpds-py==0.25.1
|
| 147 |
+
certifi==2025.4.26
|
| 148 |
+
rouge_score==0.1.2
|
| 149 |
+
traitlets==5.14.3
|
| 150 |
+
identify==2.6.12
|
| 151 |
+
spacy-legacy==3.0.12
|
| 152 |
+
weasel==0.4.1
|
| 153 |
+
mpmath==1.3.0
|
| 154 |
+
cymem==2.0.11
|
| 155 |
+
typing-inspection==0.4.1
|
| 156 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 157 |
+
marisa-trie==1.2.1
|
| 158 |
+
einops==0.8.1
|
| 159 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 160 |
+
pydantic==2.11.5
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
joblib==1.5.1
|
| 163 |
+
Jinja2==3.1.6
|
| 164 |
+
filelock==3.18.0
|
| 165 |
+
pyparsing==3.2.3
|
| 166 |
+
pure_eval==0.2.3
|
| 167 |
+
decorator==5.2.1
|
| 168 |
+
wheel==0.45.1
|
| 169 |
+
pycryptodome==3.23.0
|
| 170 |
+
cheroot==10.0.1
|
| 171 |
+
multiprocess==0.70.16
|
| 172 |
+
aiohttp==3.12.2
|
| 173 |
+
crcmod==1.7
|
| 174 |
+
fsspec==2025.3.0
|
| 175 |
+
jmespath==0.10.0
|
| 176 |
+
preshed==3.0.10
|
| 177 |
+
jaraco.functools==4.1.0
|
| 178 |
+
cryptography==45.0.3
|
| 179 |
+
sentry-sdk==2.29.1
|
| 180 |
+
tokenizers==0.21.1
|
| 181 |
+
opendelta==0.3.2
|
| 182 |
+
pycparser==2.22
|
| 183 |
+
narwhals==1.41.0
|
| 184 |
+
scikit-learn==1.6.1
|
| 185 |
+
dill==0.3.8
|
| 186 |
+
oss2==2.15.0
|
| 187 |
+
yacs==0.1.8
|
| 188 |
+
more-itertools==10.7.0
|
| 189 |
+
pip==25.1.1
|
| 190 |
+
threadpoolctl==3.6.0
|
| 191 |
+
flash-attn==2.7.1.post1
|
| 192 |
+
bigmodelvis==0.0.1
|
| 193 |
+
pathlib==1.0.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
xxhash==3.5.0
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
aliyun-python-sdk-core==2.16.0
|
| 199 |
+
transformers==4.52.3
|
| 200 |
+
aliyun-python-sdk-kms==2.16.5
|
| 201 |
+
datasets==3.6.0
|
| 202 |
+
typer==0.16.0
|
| 203 |
+
docker-pycreds==0.4.0
|
| 204 |
+
click==8.2.1
|
| 205 |
+
huggingface-hub==0.32.1
|
| 206 |
+
web.py==0.62
|
| 207 |
+
cffi==1.17.1
|
| 208 |
+
opencv-python==4.11.0.86
|
| 209 |
+
jsonschema==4.24.0
|
| 210 |
+
typing_extensions==4.12.2
|
| 211 |
+
jaraco.functools==4.0.1
|
| 212 |
+
jaraco.text==3.12.1
|
| 213 |
+
jaraco.collections==5.1.0
|
| 214 |
+
inflect==7.3.1
|
| 215 |
+
more-itertools==10.3.0
|
| 216 |
+
packaging==24.2
|
| 217 |
+
importlib_metadata==8.0.0
|
| 218 |
+
backports.tarfile==1.2.0
|
| 219 |
+
typeguard==4.3.0
|
| 220 |
+
zipp==3.19.2
|
| 221 |
+
platformdirs==4.2.2
|
| 222 |
+
autocommand==2.2.2
|
| 223 |
+
jaraco.context==5.3.0
|
| 224 |
+
tomli==2.0.1
|
| 225 |
+
wheel==0.45.1
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-28T16:12:00.926076Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06290009_deepspeed",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"168",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"root": "./all_checkpoints/stage1_06290009_deepspeed/",
|
| 34 |
+
"host": "dsw-251511-c65bb988c-9g24f",
|
| 35 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 36 |
+
"codePathLocal": "stage1.py",
|
| 37 |
+
"cpu_count": 64,
|
| 38 |
+
"cpu_count_logical": 64,
|
| 39 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 40 |
+
"gpu_count": 8,
|
| 41 |
+
"disk": {
|
| 42 |
+
"/": {
|
| 43 |
+
"total": "1623302262784",
|
| 44 |
+
"used": "987680768"
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"memory": {
|
| 48 |
+
"total": "549755813888"
|
| 49 |
+
},
|
| 50 |
+
"cpu": {
|
| 51 |
+
"count": 64,
|
| 52 |
+
"countLogical": 64
|
| 53 |
+
},
|
| 54 |
+
"gpu_nvidia": [
|
| 55 |
+
{
|
| 56 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 57 |
+
"memoryTotal": "85198045184",
|
| 58 |
+
"architecture": "Ampere"
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 62 |
+
"memoryTotal": "85198045184",
|
| 63 |
+
"architecture": "Ampere"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 67 |
+
"memoryTotal": "85198045184",
|
| 68 |
+
"architecture": "Ampere"
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 72 |
+
"memoryTotal": "85198045184",
|
| 73 |
+
"architecture": "Ampere"
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 77 |
+
"memoryTotal": "85198045184",
|
| 78 |
+
"architecture": "Ampere"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 82 |
+
"memoryTotal": "85198045184",
|
| 83 |
+
"architecture": "Ampere"
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 87 |
+
"memoryTotal": "85198045184",
|
| 88 |
+
"architecture": "Ampere"
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 92 |
+
"memoryTotal": "85198045184",
|
| 93 |
+
"architecture": "Ampere"
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"cudaVersion": "12.1"
|
| 97 |
+
}
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"loader1/val_loss_lm/dataloader_idx_1":5.460818767547607,"lr":1.0554024811426643e-05,"_timestamp":1.7511476307831557e+09,"loader1/val_loss/dataloader_idx_1":14.773687362670898,"train_loss":2.630859375,"loader2/val_loss_ptm/dataloader_idx_2":2.988206148147583,"loader1/val_loss_ptm/dataloader_idx_1":3.491912603378296,"loader2/val_loss_lm/dataloader_idx_2":6.094737529754639,"_runtime":20509.857471191,"loader2/val_loss_ptc/dataloader_idx_2":5.59119987487793,"_wandb":{"runtime":20517},"train_loss_ptc":0.64306640625,"train_loss_ptm":0.291015625,"loader0/val_loss_ptc/dataloader_idx_0":1.0085703134536743,"loader1/val_loss_ptc/dataloader_idx_1":5.825062274932861,"loader0/val_loss_lm/dataloader_idx_0":2.1440062522888184,"loader2/val_loss/dataloader_idx_2":14.674175262451172,"epoch":19,"train_loss_lm":1.697265625,"_step":147,"loader0/val_loss/dataloader_idx_0":3.6699562072753906,"loader0/val_loss_ptm/dataloader_idx_0":0.5172882676124573,"trainer/global_step":6399}
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
|
| 3 |
+
{"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
|
| 4 |
+
{"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
|
| 5 |
+
{"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
|
| 6 |
+
{"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
|
| 7 |
+
{"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
|
| 8 |
+
{"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 9 |
+
{"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
|
| 10 |
+
{"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 11 |
+
{"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 12 |
+
{"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
|
| 13 |
+
{"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
|
| 15 |
+
{"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
|
| 17 |
+
{"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 18 |
+
{"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 20 |
+
{"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
|
| 23 |
+
{"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
|
| 24 |
+
{"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
|
| 25 |
+
{"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 26 |
+
{"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 27 |
+
{"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
|
| 28 |
+
{"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
|
| 29 |
+
{"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
|
| 30 |
+
{"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 31 |
+
{"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 32 |
+
{"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
|
| 33 |
+
{"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
|
| 34 |
+
{"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
|
| 35 |
+
{"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
|
| 36 |
+
{"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 37 |
+
{"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 38 |
+
{"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 39 |
+
{"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
|
| 40 |
+
{"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
|
| 41 |
+
{"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
|
| 42 |
+
{"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
|
| 3 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
|
| 7 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
|
| 8 |
+
2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02b9bae29f7eff7d1456d46d7a5fc028ecfa83d57215ed5ead79c22a39602cb8
|
| 3 |
+
size 7831005
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eef5f0c622ec387ea2170b4d959435d6318f6de5a9cc1feb0516c58c1a678732
|
| 3 |
+
size 269461360
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db4bea839400c6bc8cdb4aeecb922ce12df2e2d61dd5044829ecbda842413c2a
|
| 3 |
+
size 269466288
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15d343b92ad55cb228b7571bad35ee7aaebdbedbcfe4160f83250484719333b3
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1f3393bc0e7b8baae8bec392cd9d8599815f083eb9c2bc478332a65f1aff728
|
| 3 |
+
size 269465648
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a24b3d97ae95d888cfdd074705605bf04315162a2c87780e94c7d6fedbca9b49
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:208d3f152953084a010524ab98240fdece461a004a016925d05ac6e3f92847a6
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8860a1be45ca900d2da827852856116e0a548ce82432d94210d4713cbbd5acc4
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d33924194e0954a8e36ef63297fdc2b9754a2e9f368e8f302cab40552b8d5f20
|
| 3 |
+
size 269465904
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7ded329a604392d5b75d24097eff999add88eee739da2b384cb7b6a8b5c311a
|
| 3 |
+
size 359448824
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/latest
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
|
| 3 |
+
size 10
|
all_checkpoints/stage1_07041727_2dataset/epoch=19.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
|
| 3 |
+
size 33272
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:194d691a6ab90298daf6d5eaa20e3f9a8164e46912823a9fef49bf8b38aad782
|
| 3 |
+
size 269461360
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c4f598e2f0e66dbdc116fd5ca5e5311fe8b80a8f685e59a9a09d376d8b1666d
|
| 3 |
+
size 269466288
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79bb71cb535f9037a708d5df0bdd2f1f7503f2863210a6e70d95e82e1252efd5
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69968693b4d1f1dc91388c21530f05303a247181a06837f0d1f7102aa1d08569
|
| 3 |
+
size 269465648
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45daf7ab82636399b71a0e7f8e8f1e713eecccbc1b02231c42c37b2827efa557
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3933fad3f64b1ec08005ff3f4b41707aec3bc8ad131e058fc01f7a7011b5063c
|
| 3 |
+
size 269466352
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1972613b6bded397cb64a5cbfe6da779c1809df0dc8c91fff5d8a75aa82384b7
|
| 3 |
+
size 269466096
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19d448bb7539c16d523eaefd5f6417c700aed03f1a6c8395c99514462f86ee1e
|
| 3 |
+
size 269465904
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b212a5b5020f57247311f70f59e51d7f724cad099891b5d172b38506ecc64642
|
| 3 |
+
size 359448824
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23721c244e955e6c1b8ef31b4e95492811f6b9bb605d28345b5cae3095393a5b
|
| 3 |
+
size 718651684
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/latest
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
|
| 3 |
+
size 10
|
all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
|
| 3 |
+
size 33272
|