Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png +0 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml +128 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log +141 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log +10 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml +128 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log +141 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log +15 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml +236 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log +20 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log +10 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml +128 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log +141 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log +15 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml +128 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log +150 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json +1 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log +10 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log +24 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb +0 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml +128 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log +110 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt +225 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json +98 -0
- ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png
ADDED
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png
ADDED
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 55
|
| 39 |
+
- 66
|
| 40 |
+
"4": 3.10.0
|
| 41 |
+
"5": 0.19.11
|
| 42 |
+
"6": 4.52.3
|
| 43 |
+
"8":
|
| 44 |
+
- 5
|
| 45 |
+
"12": 0.19.11
|
| 46 |
+
"13": linux-x86_64
|
| 47 |
+
accelerator:
|
| 48 |
+
value: gpu
|
| 49 |
+
batch_size:
|
| 50 |
+
value: 32
|
| 51 |
+
bert_hidden_dim:
|
| 52 |
+
value: 768
|
| 53 |
+
bert_name:
|
| 54 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 55 |
+
check_val_every_n_epoch:
|
| 56 |
+
value: 1
|
| 57 |
+
cross_attention_freq:
|
| 58 |
+
value: 2
|
| 59 |
+
devices:
|
| 60 |
+
value: 0,1,2,3
|
| 61 |
+
filename:
|
| 62 |
+
value: stage1_06221723
|
| 63 |
+
init_checkpoint:
|
| 64 |
+
value: ""
|
| 65 |
+
init_lr:
|
| 66 |
+
value: 0.0001
|
| 67 |
+
lm:
|
| 68 |
+
value: true
|
| 69 |
+
load_4bit:
|
| 70 |
+
value: false
|
| 71 |
+
lr_decay_rate:
|
| 72 |
+
value: 0.9
|
| 73 |
+
match_batch_size:
|
| 74 |
+
value: 64
|
| 75 |
+
max_epochs:
|
| 76 |
+
value: 20
|
| 77 |
+
min_lr:
|
| 78 |
+
value: 1e-05
|
| 79 |
+
mix_dataset:
|
| 80 |
+
value: true
|
| 81 |
+
mode:
|
| 82 |
+
value: train
|
| 83 |
+
num_query_token:
|
| 84 |
+
value: 8
|
| 85 |
+
num_workers:
|
| 86 |
+
value: 8
|
| 87 |
+
plm_name:
|
| 88 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 89 |
+
plm_tune:
|
| 90 |
+
value: freeze
|
| 91 |
+
pool_size:
|
| 92 |
+
value: 0
|
| 93 |
+
precision:
|
| 94 |
+
value: bf16-mixed
|
| 95 |
+
projection_dim:
|
| 96 |
+
value: 256
|
| 97 |
+
prot_aug:
|
| 98 |
+
value: None
|
| 99 |
+
prot_max_len:
|
| 100 |
+
value: 1024
|
| 101 |
+
ptm:
|
| 102 |
+
value: true
|
| 103 |
+
rerank_cand_num:
|
| 104 |
+
value: 128
|
| 105 |
+
retrieval_eval_epoch:
|
| 106 |
+
value: 10
|
| 107 |
+
root:
|
| 108 |
+
value: data
|
| 109 |
+
save_every_n_epochs:
|
| 110 |
+
value: 5
|
| 111 |
+
scheduler:
|
| 112 |
+
value: linear_warmup_cosine_lr
|
| 113 |
+
seed:
|
| 114 |
+
value: 42
|
| 115 |
+
strategy:
|
| 116 |
+
value: deepspeed
|
| 117 |
+
temperature:
|
| 118 |
+
value: 0.1
|
| 119 |
+
text_max_len:
|
| 120 |
+
value: 128
|
| 121 |
+
use_wandb_logger:
|
| 122 |
+
value: true
|
| 123 |
+
warmup_lr:
|
| 124 |
+
value: 1e-06
|
| 125 |
+
warmup_steps:
|
| 126 |
+
value: 1000
|
| 127 |
+
weight_decay:
|
| 128 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.86it/s]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 20 |
+
main(args)
|
| 21 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 22 |
+
trainer.fit(model, datamodule=dm)
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 24 |
+
call._call_and_handle_interrupt(
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 26 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 28 |
+
return function(*args, **kwargs)
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 30 |
+
self._run(model, ckpt_path=ckpt_path)
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 32 |
+
results = self._run_stage()
|
| 33 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 34 |
+
self._run_sanity_check()
|
| 35 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 36 |
+
val_loop.run()
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 38 |
+
return loop_run(self, *args, **kwargs)
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 40 |
+
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 42 |
+
output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 43 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 44 |
+
output = fn(*args, **kwargs)
|
| 45 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 46 |
+
return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 47 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 48 |
+
wrapper_output = wrapper_module(*args, **kwargs)
|
| 49 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 50 |
+
return self._call_impl(*args, **kwargs)
|
| 51 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 52 |
+
return forward_call(*args, **kwargs)
|
| 53 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 54 |
+
ret_val = func(*args, **kwargs)
|
| 55 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 56 |
+
loss = self.module(*inputs, **kwargs)
|
| 57 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 58 |
+
return self._call_impl(*args, **kwargs)
|
| 59 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 60 |
+
return inner()
|
| 61 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 62 |
+
result = forward_call(*args, **kwargs)
|
| 63 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 64 |
+
out = method(*_args, **_kwargs)
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 66 |
+
return func(*args, **kwargs)
|
| 67 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 68 |
+
blip2_loss = self.blip2qformer(batch)
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 70 |
+
return self._call_impl(*args, **kwargs)
|
| 71 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 72 |
+
return forward_call(*args, **kwargs)
|
| 73 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 74 |
+
sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 75 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 76 |
+
loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 77 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 78 |
+
return torch._C._nn.cross_entropy_loss(
|
| 79 |
+
RuntimeError: size mismatch (got input: [4], target: [1])
|
| 80 |
+
[rank0]: Traceback (most recent call last):
|
| 81 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 82 |
+
[rank0]: main(args)
|
| 83 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 84 |
+
[rank0]: trainer.fit(model, datamodule=dm)
|
| 85 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 86 |
+
[rank0]: call._call_and_handle_interrupt(
|
| 87 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 88 |
+
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 89 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 90 |
+
[rank0]: return function(*args, **kwargs)
|
| 91 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 92 |
+
[rank0]: self._run(model, ckpt_path=ckpt_path)
|
| 93 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 94 |
+
[rank0]: results = self._run_stage()
|
| 95 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 96 |
+
[rank0]: self._run_sanity_check()
|
| 97 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 98 |
+
[rank0]: val_loop.run()
|
| 99 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 100 |
+
[rank0]: return loop_run(self, *args, **kwargs)
|
| 101 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 102 |
+
[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 103 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 104 |
+
[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 105 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 106 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 107 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 108 |
+
[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 109 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 110 |
+
[rank0]: wrapper_output = wrapper_module(*args, **kwargs)
|
| 111 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 112 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 113 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 114 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 115 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 116 |
+
[rank0]: ret_val = func(*args, **kwargs)
|
| 117 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 118 |
+
[rank0]: loss = self.module(*inputs, **kwargs)
|
| 119 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 120 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 121 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 122 |
+
[rank0]: return inner()
|
| 123 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 124 |
+
[rank0]: result = forward_call(*args, **kwargs)
|
| 125 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 126 |
+
[rank0]: out = method(*_args, **_kwargs)
|
| 127 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 128 |
+
[rank0]: return func(*args, **kwargs)
|
| 129 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 130 |
+
[rank0]: blip2_loss = self.blip2qformer(batch)
|
| 131 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 132 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 133 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 134 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 135 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 136 |
+
[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 137 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 138 |
+
[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 139 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 140 |
+
[rank0]: return torch._C._nn.cross_entropy_loss(
|
| 141 |
+
[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:01:09.864619Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363144704"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":11}}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-22T19:01:09.866022252+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-22T19:01:10.922793248+08:00","level":"INFO","msg":"created new stream","id":"tul2l6xd"}
|
| 3 |
+
{"time":"2025-06-22T19:01:10.922840238+08:00","level":"INFO","msg":"stream: started","id":"tul2l6xd"}
|
| 4 |
+
{"time":"2025-06-22T19:01:10.922861712+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"tul2l6xd"}
|
| 5 |
+
{"time":"2025-06-22T19:01:10.922902903+08:00","level":"INFO","msg":"sender: started","stream_id":"tul2l6xd"}
|
| 6 |
+
{"time":"2025-06-22T19:01:10.922946705+08:00","level":"INFO","msg":"handler: started","stream_id":"tul2l6xd"}
|
| 7 |
+
{"time":"2025-06-22T19:01:12.123540259+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-22T19:01:21.097810278+08:00","level":"INFO","msg":"stream: closing","id":"tul2l6xd"}
|
| 9 |
+
{"time":"2025-06-22T19:01:21.097899274+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-06-22T19:01:21.098725356+08:00","level":"INFO","msg":"Stopped system monitor"}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Configure stats pid to 75754
|
| 3 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log
|
| 7 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log
|
| 8 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-22 19:01:09,860 INFO MainThread:75754 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-22 19:01:09,861 INFO MainThread:75754 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-22 19:01:09,865 INFO MainThread:75754 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-22 19:01:09,869 INFO MainThread:75754 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-22 19:01:12,112 INFO MainThread:75754 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-22 19:01:12,257 INFO MainThread:75754 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-22 19:01:16,111 INFO MainThread:75754 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-22 19:01:21,097 INFO MsgRouterThr:75754 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 55
|
| 39 |
+
- 66
|
| 40 |
+
"4": 3.10.0
|
| 41 |
+
"5": 0.19.11
|
| 42 |
+
"6": 4.52.3
|
| 43 |
+
"8":
|
| 44 |
+
- 5
|
| 45 |
+
"12": 0.19.11
|
| 46 |
+
"13": linux-x86_64
|
| 47 |
+
accelerator:
|
| 48 |
+
value: gpu
|
| 49 |
+
batch_size:
|
| 50 |
+
value: 32
|
| 51 |
+
bert_hidden_dim:
|
| 52 |
+
value: 768
|
| 53 |
+
bert_name:
|
| 54 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 55 |
+
check_val_every_n_epoch:
|
| 56 |
+
value: 1
|
| 57 |
+
cross_attention_freq:
|
| 58 |
+
value: 2
|
| 59 |
+
devices:
|
| 60 |
+
value: 0,1,2,3
|
| 61 |
+
filename:
|
| 62 |
+
value: stage1_06221723
|
| 63 |
+
init_checkpoint:
|
| 64 |
+
value: ""
|
| 65 |
+
init_lr:
|
| 66 |
+
value: 0.0001
|
| 67 |
+
lm:
|
| 68 |
+
value: true
|
| 69 |
+
load_4bit:
|
| 70 |
+
value: false
|
| 71 |
+
lr_decay_rate:
|
| 72 |
+
value: 0.9
|
| 73 |
+
match_batch_size:
|
| 74 |
+
value: 64
|
| 75 |
+
max_epochs:
|
| 76 |
+
value: 20
|
| 77 |
+
min_lr:
|
| 78 |
+
value: 1e-05
|
| 79 |
+
mix_dataset:
|
| 80 |
+
value: true
|
| 81 |
+
mode:
|
| 82 |
+
value: train
|
| 83 |
+
num_query_token:
|
| 84 |
+
value: 8
|
| 85 |
+
num_workers:
|
| 86 |
+
value: 4
|
| 87 |
+
plm_name:
|
| 88 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 89 |
+
plm_tune:
|
| 90 |
+
value: freeze
|
| 91 |
+
pool_size:
|
| 92 |
+
value: 0
|
| 93 |
+
precision:
|
| 94 |
+
value: bf16-mixed
|
| 95 |
+
projection_dim:
|
| 96 |
+
value: 256
|
| 97 |
+
prot_aug:
|
| 98 |
+
value: None
|
| 99 |
+
prot_max_len:
|
| 100 |
+
value: 1024
|
| 101 |
+
ptm:
|
| 102 |
+
value: true
|
| 103 |
+
rerank_cand_num:
|
| 104 |
+
value: 128
|
| 105 |
+
retrieval_eval_epoch:
|
| 106 |
+
value: 10
|
| 107 |
+
root:
|
| 108 |
+
value: data
|
| 109 |
+
save_every_n_epochs:
|
| 110 |
+
value: 5
|
| 111 |
+
scheduler:
|
| 112 |
+
value: linear_warmup_cosine_lr
|
| 113 |
+
seed:
|
| 114 |
+
value: 42
|
| 115 |
+
strategy:
|
| 116 |
+
value: deepspeed
|
| 117 |
+
temperature:
|
| 118 |
+
value: 0.1
|
| 119 |
+
text_max_len:
|
| 120 |
+
value: 128
|
| 121 |
+
use_wandb_logger:
|
| 122 |
+
value: true
|
| 123 |
+
warmup_lr:
|
| 124 |
+
value: 1e-06
|
| 125 |
+
warmup_steps:
|
| 126 |
+
value: 1000
|
| 127 |
+
weight_decay:
|
| 128 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.92it/s]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 20 |
+
main(args)
|
| 21 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 22 |
+
trainer.fit(model, datamodule=dm)
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 24 |
+
call._call_and_handle_interrupt(
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 26 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 28 |
+
return function(*args, **kwargs)
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 30 |
+
self._run(model, ckpt_path=ckpt_path)
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 32 |
+
results = self._run_stage()
|
| 33 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 34 |
+
self._run_sanity_check()
|
| 35 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 36 |
+
val_loop.run()
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 38 |
+
return loop_run(self, *args, **kwargs)
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 40 |
+
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 42 |
+
output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 43 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 44 |
+
output = fn(*args, **kwargs)
|
| 45 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 46 |
+
return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 47 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 48 |
+
wrapper_output = wrapper_module(*args, **kwargs)
|
| 49 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 50 |
+
return self._call_impl(*args, **kwargs)
|
| 51 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 52 |
+
return forward_call(*args, **kwargs)
|
| 53 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 54 |
+
ret_val = func(*args, **kwargs)
|
| 55 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 56 |
+
loss = self.module(*inputs, **kwargs)
|
| 57 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 58 |
+
return self._call_impl(*args, **kwargs)
|
| 59 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 60 |
+
return inner()
|
| 61 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 62 |
+
result = forward_call(*args, **kwargs)
|
| 63 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 64 |
+
out = method(*_args, **_kwargs)
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 66 |
+
return func(*args, **kwargs)
|
| 67 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 68 |
+
blip2_loss = self.blip2qformer(batch)
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 70 |
+
return self._call_impl(*args, **kwargs)
|
| 71 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 72 |
+
return forward_call(*args, **kwargs)
|
| 73 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 74 |
+
sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 75 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 76 |
+
loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 77 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 78 |
+
return torch._C._nn.cross_entropy_loss(
|
| 79 |
+
RuntimeError: size mismatch (got input: [4], target: [1])
|
| 80 |
+
[rank0]: Traceback (most recent call last):
|
| 81 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 82 |
+
[rank0]: main(args)
|
| 83 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 84 |
+
[rank0]: trainer.fit(model, datamodule=dm)
|
| 85 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 86 |
+
[rank0]: call._call_and_handle_interrupt(
|
| 87 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 88 |
+
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 89 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 90 |
+
[rank0]: return function(*args, **kwargs)
|
| 91 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 92 |
+
[rank0]: self._run(model, ckpt_path=ckpt_path)
|
| 93 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 94 |
+
[rank0]: results = self._run_stage()
|
| 95 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 96 |
+
[rank0]: self._run_sanity_check()
|
| 97 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 98 |
+
[rank0]: val_loop.run()
|
| 99 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 100 |
+
[rank0]: return loop_run(self, *args, **kwargs)
|
| 101 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 102 |
+
[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 103 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 104 |
+
[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 105 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 106 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 107 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 108 |
+
[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 109 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 110 |
+
[rank0]: wrapper_output = wrapper_module(*args, **kwargs)
|
| 111 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 112 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 113 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 114 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 115 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 116 |
+
[rank0]: ret_val = func(*args, **kwargs)
|
| 117 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 118 |
+
[rank0]: loss = self.module(*inputs, **kwargs)
|
| 119 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 120 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 121 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 122 |
+
[rank0]: return inner()
|
| 123 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 124 |
+
[rank0]: result = forward_call(*args, **kwargs)
|
| 125 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 126 |
+
[rank0]: out = method(*_args, **_kwargs)
|
| 127 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 128 |
+
[rank0]: return func(*args, **kwargs)
|
| 129 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 130 |
+
[rank0]: blip2_loss = self.blip2qformer(batch)
|
| 131 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 132 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 133 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 134 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 135 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 136 |
+
[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 137 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 138 |
+
[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 139 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 140 |
+
[rank0]: return torch._C._nn.cross_entropy_loss(
|
| 141 |
+
[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:04:29.455254Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"4",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363148800"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":11}}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-22T19:04:29.457094442+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-22T19:04:30.490221799+08:00","level":"INFO","msg":"created new stream","id":"bq9amgfj"}
|
| 3 |
+
{"time":"2025-06-22T19:04:30.490257318+08:00","level":"INFO","msg":"stream: started","id":"bq9amgfj"}
|
| 4 |
+
{"time":"2025-06-22T19:04:30.490294134+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq9amgfj"}
|
| 5 |
+
{"time":"2025-06-22T19:04:30.490304877+08:00","level":"INFO","msg":"sender: started","stream_id":"bq9amgfj"}
|
| 6 |
+
{"time":"2025-06-22T19:04:30.490450091+08:00","level":"INFO","msg":"handler: started","stream_id":"bq9amgfj"}
|
| 7 |
+
{"time":"2025-06-22T19:04:31.742014982+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-22T19:04:40.504265321+08:00","level":"INFO","msg":"stream: closing","id":"bq9amgfj"}
|
| 9 |
+
{"time":"2025-06-22T19:04:40.504307897+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-06-22T19:04:40.505067489+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 11 |
+
{"time":"2025-06-22T19:04:41.953923124+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 12 |
+
{"time":"2025-06-22T19:04:43.137437891+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq9amgfj"}
|
| 13 |
+
{"time":"2025-06-22T19:04:43.137489667+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq9amgfj"}
|
| 14 |
+
{"time":"2025-06-22T19:04:43.137501311+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq9amgfj"}
|
| 15 |
+
{"time":"2025-06-22T19:04:43.141402359+08:00","level":"INFO","msg":"stream: closed","id":"bq9amgfj"}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Configure stats pid to 79876
|
| 3 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log
|
| 7 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log
|
| 8 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-22 19:04:29,450 INFO MainThread:79876 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-22 19:04:29,452 INFO MainThread:79876 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-22 19:04:29,456 INFO MainThread:79876 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-22 19:04:29,458 INFO MainThread:79876 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-22 19:04:31,693 INFO MainThread:79876 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-22 19:04:31,896 INFO MainThread:79876 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-22 19:04:31,897 INFO MainThread:79876 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-22 19:04:31,900 INFO MainThread:79876 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-22 19:04:35,461 INFO MainThread:79876 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 4, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-22 19:04:40,503 INFO MsgRouterThr:79876 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb
ADDED
|
Binary file (20.6 kB). View file
|
|
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": loader1/val_loss/dataloader_idx_1
|
| 6 |
+
"5": 2
|
| 7 |
+
"6":
|
| 8 |
+
- 1
|
| 9 |
+
- 3
|
| 10 |
+
"7": []
|
| 11 |
+
- "1": trainer/global_step
|
| 12 |
+
"6":
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": loader1/val_loss_ptm/dataloader_idx_1
|
| 16 |
+
"5": 2
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": train_loss
|
| 22 |
+
"5": 2
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": loader2/val_loss_ptm/dataloader_idx_2
|
| 28 |
+
"5": 2
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": loader2/val_loss/dataloader_idx_2
|
| 34 |
+
"5": 2
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": epoch
|
| 40 |
+
"5": 2
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": train_loss_ptm
|
| 46 |
+
"5": 2
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": train_loss_lm
|
| 52 |
+
"5": 2
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": loader1/val_loss_ptc/dataloader_idx_1
|
| 58 |
+
"5": 2
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": loader0/val_loss_ptm/dataloader_idx_0
|
| 64 |
+
"5": 2
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": loader0/val_loss_lm/dataloader_idx_0
|
| 70 |
+
"5": 2
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": loader2/val_loss_lm/dataloader_idx_2
|
| 76 |
+
"5": 2
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
- "1": loader1/val_loss_lm/dataloader_idx_1
|
| 82 |
+
"5": 2
|
| 83 |
+
"6":
|
| 84 |
+
- 1
|
| 85 |
+
- 3
|
| 86 |
+
"7": []
|
| 87 |
+
- "1": train_loss_ptc
|
| 88 |
+
"5": 2
|
| 89 |
+
"6":
|
| 90 |
+
- 1
|
| 91 |
+
- 3
|
| 92 |
+
"7": []
|
| 93 |
+
- "1": loader0/val_loss_ptc/dataloader_idx_0
|
| 94 |
+
"5": 2
|
| 95 |
+
"6":
|
| 96 |
+
- 1
|
| 97 |
+
- 3
|
| 98 |
+
"7": []
|
| 99 |
+
- "1": loader0/val_loss/dataloader_idx_0
|
| 100 |
+
"5": 2
|
| 101 |
+
"6":
|
| 102 |
+
- 1
|
| 103 |
+
- 3
|
| 104 |
+
"7": []
|
| 105 |
+
- "1": lr
|
| 106 |
+
"5": 2
|
| 107 |
+
"6":
|
| 108 |
+
- 1
|
| 109 |
+
- 3
|
| 110 |
+
"7": []
|
| 111 |
+
- "1": loader2/val_loss_ptc/dataloader_idx_2
|
| 112 |
+
"5": 2
|
| 113 |
+
"6":
|
| 114 |
+
- 1
|
| 115 |
+
- 3
|
| 116 |
+
"7": []
|
| 117 |
+
python_version: 3.10.0
|
| 118 |
+
t:
|
| 119 |
+
"1":
|
| 120 |
+
- 1
|
| 121 |
+
- 5
|
| 122 |
+
- 9
|
| 123 |
+
- 11
|
| 124 |
+
- 33
|
| 125 |
+
- 41
|
| 126 |
+
- 49
|
| 127 |
+
- 53
|
| 128 |
+
- 55
|
| 129 |
+
- 63
|
| 130 |
+
- 103
|
| 131 |
+
"2":
|
| 132 |
+
- 1
|
| 133 |
+
- 5
|
| 134 |
+
- 9
|
| 135 |
+
- 11
|
| 136 |
+
- 33
|
| 137 |
+
- 41
|
| 138 |
+
- 49
|
| 139 |
+
- 53
|
| 140 |
+
- 55
|
| 141 |
+
- 63
|
| 142 |
+
- 103
|
| 143 |
+
"3":
|
| 144 |
+
- 7
|
| 145 |
+
- 23
|
| 146 |
+
- 55
|
| 147 |
+
- 66
|
| 148 |
+
"4": 3.10.0
|
| 149 |
+
"5": 0.19.11
|
| 150 |
+
"6": 4.52.3
|
| 151 |
+
"8":
|
| 152 |
+
- 5
|
| 153 |
+
"12": 0.19.11
|
| 154 |
+
"13": linux-x86_64
|
| 155 |
+
accelerator:
|
| 156 |
+
value: gpu
|
| 157 |
+
batch_size:
|
| 158 |
+
value: 32
|
| 159 |
+
bert_hidden_dim:
|
| 160 |
+
value: 768
|
| 161 |
+
bert_name:
|
| 162 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 163 |
+
check_val_every_n_epoch:
|
| 164 |
+
value: 1
|
| 165 |
+
cross_attention_freq:
|
| 166 |
+
value: 2
|
| 167 |
+
devices:
|
| 168 |
+
value: 0,1,2,3,4,5,6,7
|
| 169 |
+
filename:
|
| 170 |
+
value: stage1_06221723
|
| 171 |
+
init_checkpoint:
|
| 172 |
+
value: ""
|
| 173 |
+
init_lr:
|
| 174 |
+
value: 0.0001
|
| 175 |
+
lm:
|
| 176 |
+
value: true
|
| 177 |
+
load_4bit:
|
| 178 |
+
value: false
|
| 179 |
+
lr_decay_rate:
|
| 180 |
+
value: 0.9
|
| 181 |
+
match_batch_size:
|
| 182 |
+
value: 64
|
| 183 |
+
max_epochs:
|
| 184 |
+
value: 20
|
| 185 |
+
min_lr:
|
| 186 |
+
value: 1e-05
|
| 187 |
+
mix_dataset:
|
| 188 |
+
value: true
|
| 189 |
+
mode:
|
| 190 |
+
value: train
|
| 191 |
+
num_query_token:
|
| 192 |
+
value: 8
|
| 193 |
+
num_workers:
|
| 194 |
+
value: 8
|
| 195 |
+
plm_name:
|
| 196 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 197 |
+
plm_tune:
|
| 198 |
+
value: freeze
|
| 199 |
+
pool_size:
|
| 200 |
+
value: 0
|
| 201 |
+
precision:
|
| 202 |
+
value: bf16-mixed
|
| 203 |
+
projection_dim:
|
| 204 |
+
value: 256
|
| 205 |
+
prot_aug:
|
| 206 |
+
value: None
|
| 207 |
+
prot_max_len:
|
| 208 |
+
value: 1024
|
| 209 |
+
ptm:
|
| 210 |
+
value: true
|
| 211 |
+
rerank_cand_num:
|
| 212 |
+
value: 128
|
| 213 |
+
retrieval_eval_epoch:
|
| 214 |
+
value: 10
|
| 215 |
+
root:
|
| 216 |
+
value: data
|
| 217 |
+
save_every_n_epochs:
|
| 218 |
+
value: 5
|
| 219 |
+
scheduler:
|
| 220 |
+
value: linear_warmup_cosine_lr
|
| 221 |
+
seed:
|
| 222 |
+
value: 42
|
| 223 |
+
strategy:
|
| 224 |
+
value: deepspeed
|
| 225 |
+
temperature:
|
| 226 |
+
value: 0.1
|
| 227 |
+
text_max_len:
|
| 228 |
+
value: 128
|
| 229 |
+
use_wandb_logger:
|
| 230 |
+
value: true
|
| 231 |
+
warmup_lr:
|
| 232 |
+
value: 1e-06
|
| 233 |
+
warmup_steps:
|
| 234 |
+
value: 1000
|
| 235 |
+
weight_decay:
|
| 236 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Epoch 3: 18%|██████████▊ | 11/61 [00:08<00:37, 1.35it/s, v_num=24k3]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
Detected KeyboardInterrupt, attempting graceful shutdown ...
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:06:31.087290Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363152896"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"trainer/global_step":182,"loader2/val_loss_lm/dataloader_idx_2":4.984375,"loader0/val_loss_ptc/dataloader_idx_0":2.392578125,"loader2/val_loss_ptc/dataloader_idx_2":3.49609375,"loader0/val_loss/dataloader_idx_0":7.27734375,"loader0/val_loss_lm/dataloader_idx_0":4.2421875,"train_loss":6.375,"train_loss_ptm":0.63671875,"train_loss_lm":2.783203125,"loader1/val_loss/dataloader_idx_1":5.7578125,"_step":5,"_runtime":135.905828304,"_wandb":{"runtime":145},"lr":9.779754327610135e-05,"epoch":2,"loader1/val_loss_ptm/dataloader_idx_1":0.63330078125,"loader1/val_loss_ptc/dataloader_idx_1":2.71484375,"loader0/val_loss_ptm/dataloader_idx_0":0.6416015625,"loader2/val_loss_ptm/dataloader_idx_2":0.63525390625,"loader1/val_loss_lm/dataloader_idx_1":2.412109375,"loader2/val_loss/dataloader_idx_2":9.109375,"train_loss_ptc":2.953125,"_timestamp":1.7505905269926882e+09}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-22T19:06:31.147471317+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-22T19:06:32.175593884+08:00","level":"INFO","msg":"created new stream","id":"9wqt24k3"}
|
| 3 |
+
{"time":"2025-06-22T19:06:32.175639093+08:00","level":"INFO","msg":"stream: started","id":"9wqt24k3"}
|
| 4 |
+
{"time":"2025-06-22T19:06:32.175689814+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9wqt24k3"}
|
| 5 |
+
{"time":"2025-06-22T19:06:32.17571173+08:00","level":"INFO","msg":"sender: started","stream_id":"9wqt24k3"}
|
| 6 |
+
{"time":"2025-06-22T19:06:32.175747445+08:00","level":"INFO","msg":"handler: started","stream_id":"9wqt24k3"}
|
| 7 |
+
{"time":"2025-06-22T19:06:33.341553042+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-22T19:08:56.485679626+08:00","level":"INFO","msg":"stream: closing","id":"9wqt24k3"}
|
| 9 |
+
{"time":"2025-06-22T19:08:56.485774311+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-06-22T19:08:56.486438679+08:00","level":"INFO","msg":"Stopped system monitor"}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Configure stats pid to 82552
|
| 3 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log
|
| 7 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log
|
| 8 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-22 19:06:31,084 INFO MainThread:82552 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-22 19:06:31,085 INFO MainThread:82552 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-22 19:06:31,087 INFO MainThread:82552 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-22 19:06:31,088 INFO MainThread:82552 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-22 19:06:31,147 INFO MainThread:82552 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-22 19:06:33,293 INFO MainThread:82552 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-22 19:06:33,470 INFO MainThread:82552 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-22 19:06:33,471 INFO MainThread:82552 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-22 19:06:33,475 INFO MainThread:82552 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-22 19:06:38,587 INFO MainThread:82552 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-22 19:08:56,484 INFO MsgRouterThr:82552 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 55
|
| 39 |
+
- 66
|
| 40 |
+
"4": 3.10.0
|
| 41 |
+
"5": 0.19.11
|
| 42 |
+
"6": 4.52.3
|
| 43 |
+
"8":
|
| 44 |
+
- 5
|
| 45 |
+
"12": 0.19.11
|
| 46 |
+
"13": linux-x86_64
|
| 47 |
+
accelerator:
|
| 48 |
+
value: gpu
|
| 49 |
+
batch_size:
|
| 50 |
+
value: 32
|
| 51 |
+
bert_hidden_dim:
|
| 52 |
+
value: 768
|
| 53 |
+
bert_name:
|
| 54 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 55 |
+
check_val_every_n_epoch:
|
| 56 |
+
value: 1
|
| 57 |
+
cross_attention_freq:
|
| 58 |
+
value: 2
|
| 59 |
+
devices:
|
| 60 |
+
value: 0,1,2,3
|
| 61 |
+
filename:
|
| 62 |
+
value: stage1_06221723
|
| 63 |
+
init_checkpoint:
|
| 64 |
+
value: ""
|
| 65 |
+
init_lr:
|
| 66 |
+
value: 0.0001
|
| 67 |
+
lm:
|
| 68 |
+
value: true
|
| 69 |
+
load_4bit:
|
| 70 |
+
value: false
|
| 71 |
+
lr_decay_rate:
|
| 72 |
+
value: 0.9
|
| 73 |
+
match_batch_size:
|
| 74 |
+
value: 64
|
| 75 |
+
max_epochs:
|
| 76 |
+
value: 20
|
| 77 |
+
min_lr:
|
| 78 |
+
value: 1e-05
|
| 79 |
+
mix_dataset:
|
| 80 |
+
value: true
|
| 81 |
+
mode:
|
| 82 |
+
value: train
|
| 83 |
+
num_query_token:
|
| 84 |
+
value: 8
|
| 85 |
+
num_workers:
|
| 86 |
+
value: 8
|
| 87 |
+
plm_name:
|
| 88 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 89 |
+
plm_tune:
|
| 90 |
+
value: freeze
|
| 91 |
+
pool_size:
|
| 92 |
+
value: 0
|
| 93 |
+
precision:
|
| 94 |
+
value: bf16-mixed
|
| 95 |
+
projection_dim:
|
| 96 |
+
value: 256
|
| 97 |
+
prot_aug:
|
| 98 |
+
value: None
|
| 99 |
+
prot_max_len:
|
| 100 |
+
value: 1024
|
| 101 |
+
ptm:
|
| 102 |
+
value: true
|
| 103 |
+
rerank_cand_num:
|
| 104 |
+
value: 128
|
| 105 |
+
retrieval_eval_epoch:
|
| 106 |
+
value: 10
|
| 107 |
+
root:
|
| 108 |
+
value: data
|
| 109 |
+
save_every_n_epochs:
|
| 110 |
+
value: 5
|
| 111 |
+
scheduler:
|
| 112 |
+
value: linear_warmup_cosine_lr
|
| 113 |
+
seed:
|
| 114 |
+
value: 42
|
| 115 |
+
strategy:
|
| 116 |
+
value: deepspeed
|
| 117 |
+
temperature:
|
| 118 |
+
value: 0.1
|
| 119 |
+
text_max_len:
|
| 120 |
+
value: 128
|
| 121 |
+
use_wandb_logger:
|
| 122 |
+
value: true
|
| 123 |
+
warmup_lr:
|
| 124 |
+
value: 1e-06
|
| 125 |
+
warmup_steps:
|
| 126 |
+
value: 1000
|
| 127 |
+
weight_decay:
|
| 128 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.83it/s]
|
| 16 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 17 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 20 |
+
main(args)
|
| 21 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 22 |
+
trainer.fit(model, datamodule=dm)
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 24 |
+
call._call_and_handle_interrupt(
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 26 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 28 |
+
return function(*args, **kwargs)
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 30 |
+
self._run(model, ckpt_path=ckpt_path)
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 32 |
+
results = self._run_stage()
|
| 33 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 34 |
+
self._run_sanity_check()
|
| 35 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 36 |
+
val_loop.run()
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 38 |
+
return loop_run(self, *args, **kwargs)
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 40 |
+
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 42 |
+
output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 43 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 44 |
+
output = fn(*args, **kwargs)
|
| 45 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 46 |
+
return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 47 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 48 |
+
wrapper_output = wrapper_module(*args, **kwargs)
|
| 49 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 50 |
+
return self._call_impl(*args, **kwargs)
|
| 51 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 52 |
+
return forward_call(*args, **kwargs)
|
| 53 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 54 |
+
ret_val = func(*args, **kwargs)
|
| 55 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 56 |
+
loss = self.module(*inputs, **kwargs)
|
| 57 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 58 |
+
return self._call_impl(*args, **kwargs)
|
| 59 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 60 |
+
return inner()
|
| 61 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 62 |
+
result = forward_call(*args, **kwargs)
|
| 63 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 64 |
+
out = method(*_args, **_kwargs)
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 66 |
+
return func(*args, **kwargs)
|
| 67 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 68 |
+
blip2_loss = self.blip2qformer(batch)
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 70 |
+
return self._call_impl(*args, **kwargs)
|
| 71 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 72 |
+
return forward_call(*args, **kwargs)
|
| 73 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 74 |
+
sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 75 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 76 |
+
loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 77 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 78 |
+
return torch._C._nn.cross_entropy_loss(
|
| 79 |
+
RuntimeError: size mismatch (got input: [4], target: [1])
|
| 80 |
+
[rank0]: Traceback (most recent call last):
|
| 81 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 82 |
+
[rank0]: main(args)
|
| 83 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 84 |
+
[rank0]: trainer.fit(model, datamodule=dm)
|
| 85 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 86 |
+
[rank0]: call._call_and_handle_interrupt(
|
| 87 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 88 |
+
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 89 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 90 |
+
[rank0]: return function(*args, **kwargs)
|
| 91 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 92 |
+
[rank0]: self._run(model, ckpt_path=ckpt_path)
|
| 93 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 94 |
+
[rank0]: results = self._run_stage()
|
| 95 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 96 |
+
[rank0]: self._run_sanity_check()
|
| 97 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 98 |
+
[rank0]: val_loop.run()
|
| 99 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 100 |
+
[rank0]: return loop_run(self, *args, **kwargs)
|
| 101 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 102 |
+
[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 103 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 104 |
+
[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 105 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 106 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 107 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 108 |
+
[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 109 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 110 |
+
[rank0]: wrapper_output = wrapper_module(*args, **kwargs)
|
| 111 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 112 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 113 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 114 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 115 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 116 |
+
[rank0]: ret_val = func(*args, **kwargs)
|
| 117 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 118 |
+
[rank0]: loss = self.module(*inputs, **kwargs)
|
| 119 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 120 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 121 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 122 |
+
[rank0]: return inner()
|
| 123 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 124 |
+
[rank0]: result = forward_call(*args, **kwargs)
|
| 125 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 126 |
+
[rank0]: out = method(*_args, **_kwargs)
|
| 127 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 128 |
+
[rank0]: return func(*args, **kwargs)
|
| 129 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 130 |
+
[rank0]: blip2_loss = self.blip2qformer(batch)
|
| 131 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 132 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 133 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 134 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 135 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
|
| 136 |
+
[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 137 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
|
| 138 |
+
[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 139 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 140 |
+
[rank0]: return torch._C._nn.cross_entropy_loss(
|
| 141 |
+
[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:10:38.675049Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363165184"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":11}}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-22T19:10:38.679318052+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-22T19:10:39.726410578+08:00","level":"INFO","msg":"created new stream","id":"bu5vqvh5"}
|
| 3 |
+
{"time":"2025-06-22T19:10:39.726458298+08:00","level":"INFO","msg":"stream: started","id":"bu5vqvh5"}
|
| 4 |
+
{"time":"2025-06-22T19:10:39.726477847+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bu5vqvh5"}
|
| 5 |
+
{"time":"2025-06-22T19:10:39.726508843+08:00","level":"INFO","msg":"sender: started","stream_id":"bu5vqvh5"}
|
| 6 |
+
{"time":"2025-06-22T19:10:39.726652161+08:00","level":"INFO","msg":"handler: started","stream_id":"bu5vqvh5"}
|
| 7 |
+
{"time":"2025-06-22T19:10:40.990067167+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-22T19:10:49.81571789+08:00","level":"INFO","msg":"stream: closing","id":"bu5vqvh5"}
|
| 9 |
+
{"time":"2025-06-22T19:10:49.815789673+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-06-22T19:10:49.816535239+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 11 |
+
{"time":"2025-06-22T19:10:52.523961836+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 12 |
+
{"time":"2025-06-22T19:10:53.665534986+08:00","level":"INFO","msg":"handler: closed","stream_id":"bu5vqvh5"}
|
| 13 |
+
{"time":"2025-06-22T19:10:53.665598663+08:00","level":"INFO","msg":"sender: closed","stream_id":"bu5vqvh5"}
|
| 14 |
+
{"time":"2025-06-22T19:10:53.665594948+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bu5vqvh5"}
|
| 15 |
+
{"time":"2025-06-22T19:10:53.669904066+08:00","level":"INFO","msg":"stream: closed","id":"bu5vqvh5"}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Configure stats pid to 95583
|
| 3 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log
|
| 7 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log
|
| 8 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-22 19:10:38,673 INFO MainThread:95583 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-22 19:10:38,674 INFO MainThread:95583 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-22 19:10:38,675 INFO MainThread:95583 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-22 19:10:38,678 INFO MainThread:95583 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-22 19:10:40,981 INFO MainThread:95583 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-22 19:10:41,126 INFO MainThread:95583 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-22 19:10:44,823 INFO MainThread:95583 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-22 19:10:49,814 INFO MsgRouterThr:95583 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb
ADDED
|
Binary file (20.5 kB). View file
|
|
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 55
|
| 39 |
+
- 66
|
| 40 |
+
"4": 3.10.0
|
| 41 |
+
"5": 0.19.11
|
| 42 |
+
"6": 4.52.3
|
| 43 |
+
"8":
|
| 44 |
+
- 5
|
| 45 |
+
"12": 0.19.11
|
| 46 |
+
"13": linux-x86_64
|
| 47 |
+
accelerator:
|
| 48 |
+
value: gpu
|
| 49 |
+
batch_size:
|
| 50 |
+
value: 32
|
| 51 |
+
bert_hidden_dim:
|
| 52 |
+
value: 768
|
| 53 |
+
bert_name:
|
| 54 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 55 |
+
check_val_every_n_epoch:
|
| 56 |
+
value: 1
|
| 57 |
+
cross_attention_freq:
|
| 58 |
+
value: 2
|
| 59 |
+
devices:
|
| 60 |
+
value: 0,1,2,3
|
| 61 |
+
filename:
|
| 62 |
+
value: stage1_06221723
|
| 63 |
+
init_checkpoint:
|
| 64 |
+
value: ""
|
| 65 |
+
init_lr:
|
| 66 |
+
value: 0.0001
|
| 67 |
+
lm:
|
| 68 |
+
value: true
|
| 69 |
+
load_4bit:
|
| 70 |
+
value: false
|
| 71 |
+
lr_decay_rate:
|
| 72 |
+
value: 0.9
|
| 73 |
+
match_batch_size:
|
| 74 |
+
value: 64
|
| 75 |
+
max_epochs:
|
| 76 |
+
value: 20
|
| 77 |
+
min_lr:
|
| 78 |
+
value: 1e-05
|
| 79 |
+
mix_dataset:
|
| 80 |
+
value: true
|
| 81 |
+
mode:
|
| 82 |
+
value: train
|
| 83 |
+
num_query_token:
|
| 84 |
+
value: 8
|
| 85 |
+
num_workers:
|
| 86 |
+
value: 8
|
| 87 |
+
plm_name:
|
| 88 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 89 |
+
plm_tune:
|
| 90 |
+
value: freeze
|
| 91 |
+
pool_size:
|
| 92 |
+
value: 0
|
| 93 |
+
precision:
|
| 94 |
+
value: bf16-mixed
|
| 95 |
+
projection_dim:
|
| 96 |
+
value: 256
|
| 97 |
+
prot_aug:
|
| 98 |
+
value: None
|
| 99 |
+
prot_max_len:
|
| 100 |
+
value: 1024
|
| 101 |
+
ptm:
|
| 102 |
+
value: true
|
| 103 |
+
rerank_cand_num:
|
| 104 |
+
value: 128
|
| 105 |
+
retrieval_eval_epoch:
|
| 106 |
+
value: 10
|
| 107 |
+
root:
|
| 108 |
+
value: data
|
| 109 |
+
save_every_n_epochs:
|
| 110 |
+
value: 5
|
| 111 |
+
scheduler:
|
| 112 |
+
value: linear_warmup_cosine_lr
|
| 113 |
+
seed:
|
| 114 |
+
value: 42
|
| 115 |
+
strategy:
|
| 116 |
+
value: deepspeed
|
| 117 |
+
temperature:
|
| 118 |
+
value: 0.1
|
| 119 |
+
text_max_len:
|
| 120 |
+
value: 128
|
| 121 |
+
use_wandb_logger:
|
| 122 |
+
value: true
|
| 123 |
+
warmup_lr:
|
| 124 |
+
value: 1e-06
|
| 125 |
+
warmup_steps:
|
| 126 |
+
value: 1000
|
| 127 |
+
weight_decay:
|
| 128 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
------------------------------------------------------
|
| 7 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 8 |
+
------------------------------------------------------
|
| 9 |
+
179 M Trainable params
|
| 10 |
+
147 M Non-trainable params
|
| 11 |
+
327 M Total params
|
| 12 |
+
1,309.467 Total estimated model params size (MB)
|
| 13 |
+
5 Modules in train mode
|
| 14 |
+
926 Modules in eval mode
|
| 15 |
+
Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 100])
|
| 16 |
+
labels.shape: torch.Size([25])
|
| 17 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 18 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 19 |
+
Sanity Checking DataLoader 1: 0%| | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
|
| 20 |
+
labels.shape: torch.Size([32])
|
| 21 |
+
Sanity Checking DataLoader 1: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.84it/s]logits_per_graph.shape: torch.Size([18, 72])
|
| 22 |
+
labels.shape: torch.Size([18])
|
| 23 |
+
Sanity Checking DataLoader 2: 0%| | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
|
| 24 |
+
labels.shape: torch.Size([32])
|
| 25 |
+
Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.69it/s]logits_per_graph.shape: torch.Size([4])
|
| 26 |
+
labels.shape: torch.Size([1])
|
| 27 |
+
Traceback (most recent call last):
|
| 28 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 29 |
+
main(args)
|
| 30 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 31 |
+
trainer.fit(model, datamodule=dm)
|
| 32 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 33 |
+
call._call_and_handle_interrupt(
|
| 34 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 35 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 36 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 37 |
+
return function(*args, **kwargs)
|
| 38 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 39 |
+
self._run(model, ckpt_path=ckpt_path)
|
| 40 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 41 |
+
results = self._run_stage()
|
| 42 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 43 |
+
self._run_sanity_check()
|
| 44 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 45 |
+
val_loop.run()
|
| 46 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 47 |
+
return loop_run(self, *args, **kwargs)
|
| 48 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 49 |
+
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 50 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 51 |
+
output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 52 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 53 |
+
output = fn(*args, **kwargs)
|
| 54 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 55 |
+
return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 56 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 57 |
+
wrapper_output = wrapper_module(*args, **kwargs)
|
| 58 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 59 |
+
return self._call_impl(*args, **kwargs)
|
| 60 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 61 |
+
return forward_call(*args, **kwargs)
|
| 62 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 63 |
+
ret_val = func(*args, **kwargs)
|
| 64 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 65 |
+
loss = self.module(*inputs, **kwargs)
|
| 66 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 67 |
+
return self._call_impl(*args, **kwargs)
|
| 68 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 69 |
+
return inner()
|
| 70 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 71 |
+
result = forward_call(*args, **kwargs)
|
| 72 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 73 |
+
out = method(*_args, **_kwargs)
|
| 74 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 75 |
+
return func(*args, **kwargs)
|
| 76 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 77 |
+
blip2_loss = self.blip2qformer(batch)
|
| 78 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 79 |
+
return self._call_impl(*args, **kwargs)
|
| 80 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 81 |
+
return forward_call(*args, **kwargs)
|
| 82 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
|
| 83 |
+
sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 84 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
|
| 85 |
+
loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 86 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 87 |
+
return torch._C._nn.cross_entropy_loss(
|
| 88 |
+
RuntimeError: size mismatch (got input: [4], target: [1])
|
| 89 |
+
[rank0]: Traceback (most recent call last):
|
| 90 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
|
| 91 |
+
[rank0]: main(args)
|
| 92 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
|
| 93 |
+
[rank0]: trainer.fit(model, datamodule=dm)
|
| 94 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 95 |
+
[rank0]: call._call_and_handle_interrupt(
|
| 96 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 97 |
+
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 98 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 99 |
+
[rank0]: return function(*args, **kwargs)
|
| 100 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 101 |
+
[rank0]: self._run(model, ckpt_path=ckpt_path)
|
| 102 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 103 |
+
[rank0]: results = self._run_stage()
|
| 104 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
|
| 105 |
+
[rank0]: self._run_sanity_check()
|
| 106 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
|
| 107 |
+
[rank0]: val_loop.run()
|
| 108 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
|
| 109 |
+
[rank0]: return loop_run(self, *args, **kwargs)
|
| 110 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
|
| 111 |
+
[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
|
| 112 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
|
| 113 |
+
[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
|
| 114 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 115 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 116 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
|
| 117 |
+
[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
|
| 118 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 119 |
+
[rank0]: wrapper_output = wrapper_module(*args, **kwargs)
|
| 120 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 121 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 122 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 123 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 124 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 125 |
+
[rank0]: ret_val = func(*args, **kwargs)
|
| 126 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 127 |
+
[rank0]: loss = self.module(*inputs, **kwargs)
|
| 128 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 129 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 130 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 131 |
+
[rank0]: return inner()
|
| 132 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 133 |
+
[rank0]: result = forward_call(*args, **kwargs)
|
| 134 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 135 |
+
[rank0]: out = method(*_args, **_kwargs)
|
| 136 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 137 |
+
[rank0]: return func(*args, **kwargs)
|
| 138 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
|
| 139 |
+
[rank0]: blip2_loss = self.blip2qformer(batch)
|
| 140 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 141 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 142 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 143 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 144 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
|
| 145 |
+
[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
|
| 146 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
|
| 147 |
+
[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
|
| 148 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
|
| 149 |
+
[rank0]: return torch._C._nn.cross_entropy_loss(
|
| 150 |
+
[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:18:08.739768Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363197952"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":11}}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-22T19:18:08.742601474+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-22T19:18:09.773061599+08:00","level":"INFO","msg":"created new stream","id":"a2dszq0q"}
|
| 3 |
+
{"time":"2025-06-22T19:18:09.773105546+08:00","level":"INFO","msg":"stream: started","id":"a2dszq0q"}
|
| 4 |
+
{"time":"2025-06-22T19:18:09.773141968+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"a2dszq0q"}
|
| 5 |
+
{"time":"2025-06-22T19:18:09.773166056+08:00","level":"INFO","msg":"sender: started","stream_id":"a2dszq0q"}
|
| 6 |
+
{"time":"2025-06-22T19:18:09.773225667+08:00","level":"INFO","msg":"handler: started","stream_id":"a2dszq0q"}
|
| 7 |
+
{"time":"2025-06-22T19:18:11.012086945+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-22T19:18:19.860220439+08:00","level":"INFO","msg":"stream: closing","id":"a2dszq0q"}
|
| 9 |
+
{"time":"2025-06-22T19:18:19.860330929+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-06-22T19:18:19.861063374+08:00","level":"INFO","msg":"Stopped system monitor"}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-22 19:18:08,732 INFO MainThread:99755 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Configure stats pid to 99755
|
| 3 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log
|
| 7 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log
|
| 8 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-06-22 19:18:08,734 INFO MainThread:99755 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-06-22 19:18:08,736 INFO MainThread:99755 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-06-22 19:18:08,740 INFO MainThread:99755 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-06-22 19:18:08,746 INFO MainThread:99755 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-06-22 19:18:10,963 INFO MainThread:99755 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-06-22 19:18:11,141 INFO MainThread:99755 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-06-22 19:18:11,142 INFO MainThread:99755 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-06-22 19:18:11,146 INFO MainThread:99755 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-06-22 19:18:14,870 INFO MainThread:99755 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
|
| 24 |
+
2025-06-22 19:18:19,859 INFO MsgRouterThr:99755 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb
ADDED
|
File without changes
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
python_version: 3.10.0
|
| 10 |
+
t:
|
| 11 |
+
"1":
|
| 12 |
+
- 1
|
| 13 |
+
- 5
|
| 14 |
+
- 9
|
| 15 |
+
- 11
|
| 16 |
+
- 33
|
| 17 |
+
- 41
|
| 18 |
+
- 49
|
| 19 |
+
- 53
|
| 20 |
+
- 55
|
| 21 |
+
- 63
|
| 22 |
+
- 103
|
| 23 |
+
"2":
|
| 24 |
+
- 1
|
| 25 |
+
- 5
|
| 26 |
+
- 9
|
| 27 |
+
- 11
|
| 28 |
+
- 33
|
| 29 |
+
- 41
|
| 30 |
+
- 49
|
| 31 |
+
- 53
|
| 32 |
+
- 55
|
| 33 |
+
- 63
|
| 34 |
+
- 103
|
| 35 |
+
"3":
|
| 36 |
+
- 7
|
| 37 |
+
- 23
|
| 38 |
+
- 55
|
| 39 |
+
- 66
|
| 40 |
+
"4": 3.10.0
|
| 41 |
+
"5": 0.19.11
|
| 42 |
+
"6": 4.52.3
|
| 43 |
+
"8":
|
| 44 |
+
- 5
|
| 45 |
+
"12": 0.19.11
|
| 46 |
+
"13": linux-x86_64
|
| 47 |
+
accelerator:
|
| 48 |
+
value: gpu
|
| 49 |
+
batch_size:
|
| 50 |
+
value: 32
|
| 51 |
+
bert_hidden_dim:
|
| 52 |
+
value: 768
|
| 53 |
+
bert_name:
|
| 54 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 55 |
+
check_val_every_n_epoch:
|
| 56 |
+
value: 1
|
| 57 |
+
cross_attention_freq:
|
| 58 |
+
value: 2
|
| 59 |
+
devices:
|
| 60 |
+
value: 0,1,2,3,4,5,6,7
|
| 61 |
+
filename:
|
| 62 |
+
value: stage1_06221723
|
| 63 |
+
init_checkpoint:
|
| 64 |
+
value: ""
|
| 65 |
+
init_lr:
|
| 66 |
+
value: 0.0001
|
| 67 |
+
lm:
|
| 68 |
+
value: true
|
| 69 |
+
load_4bit:
|
| 70 |
+
value: false
|
| 71 |
+
lr_decay_rate:
|
| 72 |
+
value: 0.9
|
| 73 |
+
match_batch_size:
|
| 74 |
+
value: 64
|
| 75 |
+
max_epochs:
|
| 76 |
+
value: 20
|
| 77 |
+
min_lr:
|
| 78 |
+
value: 1e-05
|
| 79 |
+
mix_dataset:
|
| 80 |
+
value: true
|
| 81 |
+
mode:
|
| 82 |
+
value: train
|
| 83 |
+
num_query_token:
|
| 84 |
+
value: 8
|
| 85 |
+
num_workers:
|
| 86 |
+
value: 8
|
| 87 |
+
plm_name:
|
| 88 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 89 |
+
plm_tune:
|
| 90 |
+
value: freeze
|
| 91 |
+
pool_size:
|
| 92 |
+
value: 0
|
| 93 |
+
precision:
|
| 94 |
+
value: bf16-mixed
|
| 95 |
+
projection_dim:
|
| 96 |
+
value: 256
|
| 97 |
+
prot_aug:
|
| 98 |
+
value: None
|
| 99 |
+
prot_max_len:
|
| 100 |
+
value: 1024
|
| 101 |
+
ptm:
|
| 102 |
+
value: true
|
| 103 |
+
rerank_cand_num:
|
| 104 |
+
value: 128
|
| 105 |
+
retrieval_eval_epoch:
|
| 106 |
+
value: 10
|
| 107 |
+
root:
|
| 108 |
+
value: data
|
| 109 |
+
save_every_n_epochs:
|
| 110 |
+
value: 5
|
| 111 |
+
scheduler:
|
| 112 |
+
value: linear_warmup_cosine_lr
|
| 113 |
+
seed:
|
| 114 |
+
value: 42
|
| 115 |
+
strategy:
|
| 116 |
+
value: deepspeed
|
| 117 |
+
temperature:
|
| 118 |
+
value: 0.1
|
| 119 |
+
text_max_len:
|
| 120 |
+
value: 128
|
| 121 |
+
use_wandb_logger:
|
| 122 |
+
value: true
|
| 123 |
+
warmup_lr:
|
| 124 |
+
value: 1e-06
|
| 125 |
+
warmup_steps:
|
| 126 |
+
value: 1000
|
| 127 |
+
weight_decay:
|
| 128 |
+
value: 0.05
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[rank0]:W0622 19:26:47.840041 104393 site-packages/torch/distributed/distributed_c10d.py:2941] _object_to_tensor size: 81 hash value: 6444836214324640892
|
| 2 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
|
| 3 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 4 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 5 |
+
|
| 6 |
+
| Name | Type | Params | Mode
|
| 7 |
+
------------------------------------------------------
|
| 8 |
+
0 | blip2qformer | Blip2Qformer | 327 M | train
|
| 9 |
+
------------------------------------------------------
|
| 10 |
+
179 M Trainable params
|
| 11 |
+
147 M Non-trainable params
|
| 12 |
+
327 M Total params
|
| 13 |
+
1,309.467 Total estimated model params size (MB)
|
| 14 |
+
5 Modules in train mode
|
| 15 |
+
926 Modules in eval mode
|
| 16 |
+
Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([13, 104])
|
| 17 |
+
labels.shape: torch.Size([13])
|
| 18 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
|
| 19 |
+
with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
|
| 20 |
+
Sanity Checking DataLoader 1: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 200])
|
| 21 |
+
labels.shape: torch.Size([25])
|
| 22 |
+
Sanity Checking DataLoader 2: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([17, 136])
|
| 23 |
+
labels.shape: torch.Size([17])
|
| 24 |
+
Epoch 0: 0%| | 0/61 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 256])
|
| 25 |
+
labels.shape: torch.Size([32])
|
| 26 |
+
Epoch 0: 2%|█ | 1/61 [00:01<01:11, 0.84it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 27 |
+
labels.shape: torch.Size([32])
|
| 28 |
+
Epoch 0: 3%|██ | 2/61 [00:01<00:55, 1.07it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 29 |
+
labels.shape: torch.Size([32])
|
| 30 |
+
Epoch 0: 5%|███ | 3/61 [00:02<00:49, 1.17it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 31 |
+
labels.shape: torch.Size([32])
|
| 32 |
+
Epoch 0: 7%|████ | 4/61 [00:03<00:46, 1.24it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 33 |
+
labels.shape: torch.Size([32])
|
| 34 |
+
Epoch 0: 8%|█████ | 5/61 [00:03<00:43, 1.28it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 35 |
+
labels.shape: torch.Size([32])
|
| 36 |
+
Epoch 0: 10%|██████ | 6/61 [00:04<00:42, 1.31it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 37 |
+
labels.shape: torch.Size([32])
|
| 38 |
+
Epoch 0: 11%|███████ | 7/61 [00:05<00:40, 1.33it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 39 |
+
labels.shape: torch.Size([32])
|
| 40 |
+
Epoch 0: 13%|████████ | 8/61 [00:05<00:39, 1.35it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 41 |
+
labels.shape: torch.Size([32])
|
| 42 |
+
Epoch 0: 15%|█████████ | 9/61 [00:06<00:38, 1.36it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 43 |
+
labels.shape: torch.Size([32])
|
| 44 |
+
Epoch 0: 16%|█████████▊ | 10/61 [00:07<00:37, 1.37it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 45 |
+
labels.shape: torch.Size([32])
|
| 46 |
+
Epoch 0: 18%|██████████▊ | 11/61 [00:07<00:36, 1.38it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 47 |
+
labels.shape: torch.Size([32])
|
| 48 |
+
Epoch 0: 20%|███████████▊ | 12/61 [00:08<00:35, 1.39it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 49 |
+
labels.shape: torch.Size([32])
|
| 50 |
+
Epoch 0: 21%|████████████▊ | 13/61 [00:09<00:34, 1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 51 |
+
labels.shape: torch.Size([32])
|
| 52 |
+
Epoch 0: 23%|█████████████▊ | 14/61 [00:09<00:33, 1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 53 |
+
labels.shape: torch.Size([32])
|
| 54 |
+
Epoch 0: 25%|██████████████▊ | 15/61 [00:10<00:32, 1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 55 |
+
labels.shape: torch.Size([32])
|
| 56 |
+
Epoch 0: 26%|███████████████▋ | 16/61 [00:11<00:31, 1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 57 |
+
labels.shape: torch.Size([32])
|
| 58 |
+
Epoch 0: 28%|████████████████▋ | 17/61 [00:12<00:31, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 59 |
+
labels.shape: torch.Size([32])
|
| 60 |
+
Epoch 0: 30%|█████████████████▋ | 18/61 [00:12<00:30, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 61 |
+
labels.shape: torch.Size([32])
|
| 62 |
+
Epoch 0: 31%|██████████████████▋ | 19/61 [00:13<00:29, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 63 |
+
labels.shape: torch.Size([32])
|
| 64 |
+
Epoch 0: 33%|███████████████████▋ | 20/61 [00:14<00:28, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 65 |
+
labels.shape: torch.Size([32])
|
| 66 |
+
Epoch 0: 34%|████████████████████▋ | 21/61 [00:14<00:28, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 67 |
+
labels.shape: torch.Size([32])
|
| 68 |
+
Epoch 0: 36%|█████████████████████▋ | 22/61 [00:15<00:27, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 69 |
+
labels.shape: torch.Size([32])
|
| 70 |
+
Epoch 0: 38%|██████████████████████▌ | 23/61 [00:16<00:26, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 71 |
+
labels.shape: torch.Size([32])
|
| 72 |
+
Epoch 0: 39%|███████████████████████▌ | 24/61 [00:16<00:25, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 73 |
+
labels.shape: torch.Size([32])
|
| 74 |
+
Epoch 0: 41%|████████████████████████▌ | 25/61 [00:17<00:25, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 75 |
+
labels.shape: torch.Size([32])
|
| 76 |
+
Epoch 0: 43%|█████████████████████████▌ | 26/61 [00:18<00:24, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 77 |
+
labels.shape: torch.Size([32])
|
| 78 |
+
Epoch 0: 44%|██████████████████████████▌ | 27/61 [00:18<00:23, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 79 |
+
labels.shape: torch.Size([32])
|
| 80 |
+
Epoch 0: 46%|███████████████████████████▌ | 28/61 [00:19<00:22, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 81 |
+
labels.shape: torch.Size([32])
|
| 82 |
+
Epoch 0: 48%|████████████████████████████▌ | 29/61 [00:20<00:22, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 83 |
+
labels.shape: torch.Size([32])
|
| 84 |
+
Epoch 0: 49%|█████████████████████████████▌ | 30/61 [00:20<00:21, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 85 |
+
labels.shape: torch.Size([32])
|
| 86 |
+
Epoch 0: 51%|██████████████████████████████▍ | 31/61 [00:21<00:20, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 87 |
+
labels.shape: torch.Size([32])
|
| 88 |
+
Epoch 0: 52%|███████████████████████████████▍ | 32/61 [00:22<00:20, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 89 |
+
labels.shape: torch.Size([32])
|
| 90 |
+
Epoch 0: 54%|████████████████████████████████▍ | 33/61 [00:22<00:19, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 91 |
+
labels.shape: torch.Size([32])
|
| 92 |
+
Epoch 0: 56%|█████████████████████████████████▍ | 34/61 [00:23<00:18, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 93 |
+
labels.shape: torch.Size([32])
|
| 94 |
+
Epoch 0: 57%|██████████████████████████████████▍ | 35/61 [00:24<00:17, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 95 |
+
labels.shape: torch.Size([32])
|
| 96 |
+
Epoch 0: 59%|█████████████████████████���█████████▍ | 36/61 [00:24<00:17, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 97 |
+
labels.shape: torch.Size([32])
|
| 98 |
+
Epoch 0: 61%|████████████████████████████████████▍ | 37/61 [00:25<00:16, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 99 |
+
labels.shape: torch.Size([32])
|
| 100 |
+
Epoch 0: 62%|█████████████████████████████████████▍ | 38/61 [00:26<00:15, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 101 |
+
labels.shape: torch.Size([32])
|
| 102 |
+
Epoch 0: 64%|██████████████████████████████████████▎ | 39/61 [00:26<00:15, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 103 |
+
labels.shape: torch.Size([32])
|
| 104 |
+
Epoch 0: 66%|███████████████████████████████████████▎ | 40/61 [00:27<00:14, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 105 |
+
labels.shape: torch.Size([32])
|
| 106 |
+
Epoch 0: 67%|████████████████████████████████████████▎ | 41/61 [00:28<00:13, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
|
| 107 |
+
labels.shape: torch.Size([32])
|
| 108 |
+
Epoch 0: 69%|█████████████████████████████████████████▎ | 42/61 [00:28<00:13, 1.45it/s, v_num=vn72]
|
| 109 |
+
|
| 110 |
+
Detected KeyboardInterrupt, attempting graceful shutdown ...
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
confection==0.1.5
|
| 3 |
+
text-unidecode==1.3
|
| 4 |
+
contexttimer==0.3.3
|
| 5 |
+
omegaconf==2.3.0
|
| 6 |
+
tzdata==2025.2
|
| 7 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 8 |
+
plotly==6.1.1
|
| 9 |
+
decord==0.6.0
|
| 10 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 11 |
+
scipy==1.15.3
|
| 12 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 13 |
+
parso==0.8.4
|
| 14 |
+
python-dateutil==2.9.0.post0
|
| 15 |
+
setuptools==78.1.1
|
| 16 |
+
aiosignal==1.3.2
|
| 17 |
+
joblib==1.5.1
|
| 18 |
+
platformdirs==4.3.8
|
| 19 |
+
regex==2024.11.6
|
| 20 |
+
aiohappyeyeballs==2.6.1
|
| 21 |
+
virtualenv==20.31.2
|
| 22 |
+
lazy_loader==0.4
|
| 23 |
+
rich==14.0.0
|
| 24 |
+
timm==0.4.12
|
| 25 |
+
antlr4-python3-runtime==4.9.3
|
| 26 |
+
pandas==2.2.3
|
| 27 |
+
salesforce-lavis==1.0.2
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
six==1.17.0
|
| 30 |
+
smmap==5.0.2
|
| 31 |
+
annotated-types==0.7.0
|
| 32 |
+
pyparsing==3.2.3
|
| 33 |
+
Jinja2==3.1.6
|
| 34 |
+
ptyprocess==0.7.0
|
| 35 |
+
streamlit==1.45.1
|
| 36 |
+
idna==3.10
|
| 37 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 38 |
+
tenacity==9.1.2
|
| 39 |
+
sentencepiece==0.2.0
|
| 40 |
+
matplotlib-inline==0.1.7
|
| 41 |
+
typing-inspection==0.4.1
|
| 42 |
+
packaging==24.2
|
| 43 |
+
nltk==3.9.1
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
catalogue==2.0.10
|
| 46 |
+
matplotlib==3.10.3
|
| 47 |
+
propcache==0.3.1
|
| 48 |
+
Pygments==2.19.1
|
| 49 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 50 |
+
requests==2.32.3
|
| 51 |
+
filelock==3.18.0
|
| 52 |
+
pexpect==4.9.0
|
| 53 |
+
opencv-python-headless==4.5.5.64
|
| 54 |
+
certifi==2025.4.26
|
| 55 |
+
nvidia-nvtx-cu12==12.4.127
|
| 56 |
+
bleach==6.2.0
|
| 57 |
+
typing_extensions==4.13.2
|
| 58 |
+
tornado==6.5.1
|
| 59 |
+
networkx==3.4.2
|
| 60 |
+
sympy==1.13.1
|
| 61 |
+
watchdog==6.0.0
|
| 62 |
+
kaggle==1.7.4.5
|
| 63 |
+
nvidia-ml-py==12.575.51
|
| 64 |
+
pyarrow==20.0.0
|
| 65 |
+
mpmath==1.3.0
|
| 66 |
+
lightning-utilities==0.14.3
|
| 67 |
+
ftfy==6.3.1
|
| 68 |
+
triton==3.2.0
|
| 69 |
+
referencing==0.36.2
|
| 70 |
+
ipython==8.36.0
|
| 71 |
+
yarl==1.20.0
|
| 72 |
+
language_data==1.3.0
|
| 73 |
+
cycler==0.12.1
|
| 74 |
+
python-magic==0.4.27
|
| 75 |
+
wasabi==1.1.3
|
| 76 |
+
protobuf==6.31.0
|
| 77 |
+
murmurhash==1.0.13
|
| 78 |
+
jsonschema-specifications==2025.4.1
|
| 79 |
+
blinker==1.9.0
|
| 80 |
+
fonttools==4.58.0
|
| 81 |
+
imageio==2.37.0
|
| 82 |
+
pycocoevalcap==1.2
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 84 |
+
fairscale==0.4.4
|
| 85 |
+
hjson==3.1.0
|
| 86 |
+
identify==2.6.12
|
| 87 |
+
mdurl==0.1.2
|
| 88 |
+
decorator==5.2.1
|
| 89 |
+
distlib==0.3.9
|
| 90 |
+
webencodings==0.5.1
|
| 91 |
+
kiwisolver==1.4.8
|
| 92 |
+
srsly==2.5.1
|
| 93 |
+
frozenlist==1.6.0
|
| 94 |
+
blis==1.3.0
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
hf-xet==1.1.2
|
| 97 |
+
cymem==2.0.11
|
| 98 |
+
pillow==11.2.1
|
| 99 |
+
pycocotools==2.0.8
|
| 100 |
+
pre_commit==4.2.0
|
| 101 |
+
wrapt==1.17.2
|
| 102 |
+
nvidia-curand-cu12==10.3.5.147
|
| 103 |
+
spacy==3.8.7
|
| 104 |
+
rpds-py==0.25.1
|
| 105 |
+
exceptiongroup==1.3.0
|
| 106 |
+
braceexpand==0.1.7
|
| 107 |
+
rouge_score==0.1.2
|
| 108 |
+
async-timeout==5.0.1
|
| 109 |
+
torchmetrics==1.7.1
|
| 110 |
+
nvidia-nccl-cu12==2.21.5
|
| 111 |
+
wcwidth==0.2.13
|
| 112 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 113 |
+
scikit-image==0.25.2
|
| 114 |
+
urllib3==2.4.0
|
| 115 |
+
portalocker==3.1.1
|
| 116 |
+
smart-open==7.1.0
|
| 117 |
+
cfgv==3.4.0
|
| 118 |
+
markdown-it-py==3.0.0
|
| 119 |
+
charset-normalizer==3.4.2
|
| 120 |
+
executing==2.2.0
|
| 121 |
+
pure_eval==0.2.3
|
| 122 |
+
safetensors==0.5.3
|
| 123 |
+
spacy-legacy==3.0.12
|
| 124 |
+
shellingham==1.5.4
|
| 125 |
+
langcodes==3.5.0
|
| 126 |
+
pytz==2025.2
|
| 127 |
+
iopath==0.1.10
|
| 128 |
+
weasel==0.4.1
|
| 129 |
+
tifffile==2025.5.10
|
| 130 |
+
nodeenv==1.9.1
|
| 131 |
+
absl-py==2.2.2
|
| 132 |
+
einops==0.8.1
|
| 133 |
+
msgpack==1.1.0
|
| 134 |
+
pydantic_core==2.33.2
|
| 135 |
+
ninja==1.11.1.4
|
| 136 |
+
altair==5.5.0
|
| 137 |
+
attrs==25.3.0
|
| 138 |
+
tqdm==4.67.1
|
| 139 |
+
deepspeed==0.16.10+b666844f
|
| 140 |
+
pydeck==0.9.1
|
| 141 |
+
stack-data==0.6.3
|
| 142 |
+
pydantic==2.11.5
|
| 143 |
+
torch==2.6.0
|
| 144 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 145 |
+
python-slugify==8.0.4
|
| 146 |
+
webdataset==0.2.111
|
| 147 |
+
pytorch-lightning==2.5.1.post0
|
| 148 |
+
prompt_toolkit==3.0.51
|
| 149 |
+
psutil==7.0.0
|
| 150 |
+
opendatasets==0.1.22
|
| 151 |
+
asttokens==3.0.0
|
| 152 |
+
MarkupSafe==3.0.2
|
| 153 |
+
multidict==6.4.4
|
| 154 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 155 |
+
GitPython==3.1.44
|
| 156 |
+
PyYAML==6.0.2
|
| 157 |
+
cloudpathlib==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
marisa-trie==1.2.1
|
| 160 |
+
traitlets==5.14.3
|
| 161 |
+
cachetools==5.5.2
|
| 162 |
+
spacy-loggers==1.0.5
|
| 163 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 164 |
+
torchvision==0.21.0
|
| 165 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 166 |
+
jedi==0.19.2
|
| 167 |
+
thinc==8.3.6
|
| 168 |
+
py-cpuinfo==9.0.0
|
| 169 |
+
yacs==0.1.8
|
| 170 |
+
cffi==1.17.1
|
| 171 |
+
preshed==3.0.10
|
| 172 |
+
more-itertools==10.7.0
|
| 173 |
+
bigmodelvis==0.0.1
|
| 174 |
+
datasets==3.6.0
|
| 175 |
+
huggingface-hub==0.32.1
|
| 176 |
+
narwhals==1.41.0
|
| 177 |
+
xxhash==3.5.0
|
| 178 |
+
sentry-sdk==2.29.1
|
| 179 |
+
aiohttp==3.12.2
|
| 180 |
+
opencv-python==4.11.0.86
|
| 181 |
+
pycryptodome==3.23.0
|
| 182 |
+
threadpoolctl==3.6.0
|
| 183 |
+
flash-attn==2.7.1.post1
|
| 184 |
+
transformers==4.52.3
|
| 185 |
+
pycparser==2.22
|
| 186 |
+
pathlib==1.0.1
|
| 187 |
+
dill==0.3.8
|
| 188 |
+
scikit-learn==1.6.1
|
| 189 |
+
tokenizers==0.21.1
|
| 190 |
+
aliyun-python-sdk-core==2.16.0
|
| 191 |
+
fsspec==2025.3.0
|
| 192 |
+
jmespath==0.10.0
|
| 193 |
+
click==8.2.1
|
| 194 |
+
delta-center-client==0.0.4
|
| 195 |
+
cheroot==10.0.1
|
| 196 |
+
wandb==0.19.11
|
| 197 |
+
setproctitle==1.3.6
|
| 198 |
+
jsonschema==4.24.0
|
| 199 |
+
oss2==2.15.0
|
| 200 |
+
multiprocess==0.70.16
|
| 201 |
+
jaraco.functools==4.1.0
|
| 202 |
+
web.py==0.62
|
| 203 |
+
aliyun-python-sdk-kms==2.16.5
|
| 204 |
+
cryptography==45.0.3
|
| 205 |
+
pip==25.1.1
|
| 206 |
+
docker-pycreds==0.4.0
|
| 207 |
+
typer==0.16.0
|
| 208 |
+
opendelta==0.3.2
|
| 209 |
+
crcmod==1.7
|
| 210 |
+
jaraco.functools==4.0.1
|
| 211 |
+
inflect==7.3.1
|
| 212 |
+
jaraco.collections==5.1.0
|
| 213 |
+
packaging==24.2
|
| 214 |
+
wheel==0.45.1
|
| 215 |
+
tomli==2.0.1
|
| 216 |
+
platformdirs==4.2.2
|
| 217 |
+
typing_extensions==4.12.2
|
| 218 |
+
more-itertools==10.3.0
|
| 219 |
+
autocommand==2.2.2
|
| 220 |
+
jaraco.text==3.12.1
|
| 221 |
+
importlib_metadata==8.0.0
|
| 222 |
+
jaraco.context==5.3.0
|
| 223 |
+
zipp==3.19.2
|
| 224 |
+
backports.tarfile==1.2.0
|
| 225 |
+
typeguard==4.3.0
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-22T11:26:45.457479Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage1_06221723",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--plm_name",
|
| 15 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 16 |
+
"--bert_name",
|
| 17 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 18 |
+
"--save_every_n_epochs",
|
| 19 |
+
"5",
|
| 20 |
+
"--max_epochs",
|
| 21 |
+
"20",
|
| 22 |
+
"--batch_size",
|
| 23 |
+
"32",
|
| 24 |
+
"--precision",
|
| 25 |
+
"bf16-mixed",
|
| 26 |
+
"--mix_dataset",
|
| 27 |
+
"--num_workers",
|
| 28 |
+
"8",
|
| 29 |
+
"--use_wandb_logger"
|
| 30 |
+
],
|
| 31 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
|
| 32 |
+
"codePath": "stage1.py",
|
| 33 |
+
"email": "gia0603yucca@gmail.com",
|
| 34 |
+
"root": "./all_checkpoints/stage1_06221723/",
|
| 35 |
+
"host": "dsw-251511-c5cfcb8-lwcpt",
|
| 36 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 37 |
+
"codePathLocal": "stage1.py",
|
| 38 |
+
"cpu_count": 64,
|
| 39 |
+
"cpu_count_logical": 64,
|
| 40 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 41 |
+
"gpu_count": 8,
|
| 42 |
+
"disk": {
|
| 43 |
+
"/": {
|
| 44 |
+
"total": "1623302262784",
|
| 45 |
+
"used": "1363202048"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"memory": {
|
| 49 |
+
"total": "549755813888"
|
| 50 |
+
},
|
| 51 |
+
"cpu": {
|
| 52 |
+
"count": 64,
|
| 53 |
+
"countLogical": 64
|
| 54 |
+
},
|
| 55 |
+
"gpu_nvidia": [
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 58 |
+
"memoryTotal": "85198045184",
|
| 59 |
+
"architecture": "Ampere"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 63 |
+
"memoryTotal": "85198045184",
|
| 64 |
+
"architecture": "Ampere"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 68 |
+
"memoryTotal": "85198045184",
|
| 69 |
+
"architecture": "Ampere"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 73 |
+
"memoryTotal": "85198045184",
|
| 74 |
+
"architecture": "Ampere"
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 78 |
+
"memoryTotal": "85198045184",
|
| 79 |
+
"architecture": "Ampere"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 83 |
+
"memoryTotal": "85198045184",
|
| 84 |
+
"architecture": "Ampere"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 88 |
+
"memoryTotal": "85198045184",
|
| 89 |
+
"architecture": "Ampere"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 93 |
+
"memoryTotal": "85198045184",
|
| 94 |
+
"architecture": "Ampere"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"cudaVersion": "12.1"
|
| 98 |
+
}
|
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb
ADDED
|
Binary file (58.8 kB). View file
|
|
|