yuccaaa commited on
Commit
992a397
·
verified ·
1 Parent(s): e48e67e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png +0 -0
  2. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png +0 -0
  3. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png +0 -0
  4. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png +0 -0
  5. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png +0 -0
  6. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png +0 -0
  7. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml +128 -0
  8. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log +141 -0
  9. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt +225 -0
  10. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json +98 -0
  11. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json +1 -0
  12. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log +10 -0
  13. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log +24 -0
  14. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb +0 -0
  15. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml +128 -0
  16. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log +141 -0
  17. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt +225 -0
  18. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json +98 -0
  19. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json +1 -0
  20. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log +15 -0
  21. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log +24 -0
  22. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb +0 -0
  23. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml +236 -0
  24. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log +20 -0
  25. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt +225 -0
  26. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json +98 -0
  27. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json +1 -0
  28. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log +10 -0
  29. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log +24 -0
  30. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml +128 -0
  31. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log +141 -0
  32. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt +225 -0
  33. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json +98 -0
  34. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json +1 -0
  35. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log +15 -0
  36. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log +24 -0
  37. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb +0 -0
  38. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml +128 -0
  39. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log +150 -0
  40. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt +225 -0
  41. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json +98 -0
  42. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json +1 -0
  43. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log +10 -0
  44. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log +24 -0
  45. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb +0 -0
  46. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml +128 -0
  47. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log +110 -0
  48. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt +225 -0
  49. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json +98 -0
  50. ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png ADDED
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 55
39
+ - 66
40
+ "4": 3.10.0
41
+ "5": 0.19.11
42
+ "6": 4.52.3
43
+ "8":
44
+ - 5
45
+ "12": 0.19.11
46
+ "13": linux-x86_64
47
+ accelerator:
48
+ value: gpu
49
+ batch_size:
50
+ value: 32
51
+ bert_hidden_dim:
52
+ value: 768
53
+ bert_name:
54
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
55
+ check_val_every_n_epoch:
56
+ value: 1
57
+ cross_attention_freq:
58
+ value: 2
59
+ devices:
60
+ value: 0,1,2,3
61
+ filename:
62
+ value: stage1_06221723
63
+ init_checkpoint:
64
+ value: ""
65
+ init_lr:
66
+ value: 0.0001
67
+ lm:
68
+ value: true
69
+ load_4bit:
70
+ value: false
71
+ lr_decay_rate:
72
+ value: 0.9
73
+ match_batch_size:
74
+ value: 64
75
+ max_epochs:
76
+ value: 20
77
+ min_lr:
78
+ value: 1e-05
79
+ mix_dataset:
80
+ value: true
81
+ mode:
82
+ value: train
83
+ num_query_token:
84
+ value: 8
85
+ num_workers:
86
+ value: 8
87
+ plm_name:
88
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
89
+ plm_tune:
90
+ value: freeze
91
+ pool_size:
92
+ value: 0
93
+ precision:
94
+ value: bf16-mixed
95
+ projection_dim:
96
+ value: 256
97
+ prot_aug:
98
+ value: None
99
+ prot_max_len:
100
+ value: 1024
101
+ ptm:
102
+ value: true
103
+ rerank_cand_num:
104
+ value: 128
105
+ retrieval_eval_epoch:
106
+ value: 10
107
+ root:
108
+ value: data
109
+ save_every_n_epochs:
110
+ value: 5
111
+ scheduler:
112
+ value: linear_warmup_cosine_lr
113
+ seed:
114
+ value: 42
115
+ strategy:
116
+ value: deepspeed
117
+ temperature:
118
+ value: 0.1
119
+ text_max_len:
120
+ value: 128
121
+ use_wandb_logger:
122
+ value: true
123
+ warmup_lr:
124
+ value: 1e-06
125
+ warmup_steps:
126
+ value: 1000
127
+ weight_decay:
128
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.86it/s]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+ Traceback (most recent call last):
19
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
20
+ main(args)
21
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
22
+ trainer.fit(model, datamodule=dm)
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
24
+ call._call_and_handle_interrupt(
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
26
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
28
+ return function(*args, **kwargs)
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
30
+ self._run(model, ckpt_path=ckpt_path)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
32
+ results = self._run_stage()
33
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
34
+ self._run_sanity_check()
35
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
36
+ val_loop.run()
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
38
+ return loop_run(self, *args, **kwargs)
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
40
+ self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
42
+ output = call._call_strategy_hook(trainer, hook_name, *step_args)
43
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
44
+ output = fn(*args, **kwargs)
45
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
46
+ return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
48
+ wrapper_output = wrapper_module(*args, **kwargs)
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
50
+ return self._call_impl(*args, **kwargs)
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
52
+ return forward_call(*args, **kwargs)
53
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
54
+ ret_val = func(*args, **kwargs)
55
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
56
+ loss = self.module(*inputs, **kwargs)
57
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
58
+ return self._call_impl(*args, **kwargs)
59
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
60
+ return inner()
61
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
62
+ result = forward_call(*args, **kwargs)
63
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
64
+ out = method(*_args, **_kwargs)
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
66
+ return func(*args, **kwargs)
67
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
68
+ blip2_loss = self.blip2qformer(batch)
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
70
+ return self._call_impl(*args, **kwargs)
71
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
72
+ return forward_call(*args, **kwargs)
73
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
74
+ sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
75
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
76
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
77
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
78
+ return torch._C._nn.cross_entropy_loss(
79
+ RuntimeError: size mismatch (got input: [4], target: [1])
80
+ [rank0]: Traceback (most recent call last):
81
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
82
+ [rank0]: main(args)
83
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
84
+ [rank0]: trainer.fit(model, datamodule=dm)
85
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
86
+ [rank0]: call._call_and_handle_interrupt(
87
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
88
+ [rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
89
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
90
+ [rank0]: return function(*args, **kwargs)
91
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
92
+ [rank0]: self._run(model, ckpt_path=ckpt_path)
93
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
94
+ [rank0]: results = self._run_stage()
95
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
96
+ [rank0]: self._run_sanity_check()
97
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
98
+ [rank0]: val_loop.run()
99
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
100
+ [rank0]: return loop_run(self, *args, **kwargs)
101
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
102
+ [rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
103
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
104
+ [rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
105
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
106
+ [rank0]: output = fn(*args, **kwargs)
107
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
108
+ [rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
109
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
110
+ [rank0]: wrapper_output = wrapper_module(*args, **kwargs)
111
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
112
+ [rank0]: return self._call_impl(*args, **kwargs)
113
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
114
+ [rank0]: return forward_call(*args, **kwargs)
115
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
116
+ [rank0]: ret_val = func(*args, **kwargs)
117
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
118
+ [rank0]: loss = self.module(*inputs, **kwargs)
119
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
120
+ [rank0]: return self._call_impl(*args, **kwargs)
121
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
122
+ [rank0]: return inner()
123
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
124
+ [rank0]: result = forward_call(*args, **kwargs)
125
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
126
+ [rank0]: out = method(*_args, **_kwargs)
127
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
128
+ [rank0]: return func(*args, **kwargs)
129
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
130
+ [rank0]: blip2_loss = self.blip2qformer(batch)
131
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
132
+ [rank0]: return self._call_impl(*args, **kwargs)
133
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
134
+ [rank0]: return forward_call(*args, **kwargs)
135
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
136
+ [rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
137
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
138
+ [rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
139
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
140
+ [rank0]: return torch._C._nn.cross_entropy_loss(
141
+ [rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:01:09.864619Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363144704"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":11}}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-22T19:01:09.866022252+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-core.log"}
2
+ {"time":"2025-06-22T19:01:10.922793248+08:00","level":"INFO","msg":"created new stream","id":"tul2l6xd"}
3
+ {"time":"2025-06-22T19:01:10.922840238+08:00","level":"INFO","msg":"stream: started","id":"tul2l6xd"}
4
+ {"time":"2025-06-22T19:01:10.922861712+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"tul2l6xd"}
5
+ {"time":"2025-06-22T19:01:10.922902903+08:00","level":"INFO","msg":"sender: started","stream_id":"tul2l6xd"}
6
+ {"time":"2025-06-22T19:01:10.922946705+08:00","level":"INFO","msg":"handler: started","stream_id":"tul2l6xd"}
7
+ {"time":"2025-06-22T19:01:12.123540259+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-22T19:01:21.097810278+08:00","level":"INFO","msg":"stream: closing","id":"tul2l6xd"}
9
+ {"time":"2025-06-22T19:01:21.097899274+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-22T19:01:21.098725356+08:00","level":"INFO","msg":"Stopped system monitor"}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Configure stats pid to 75754
3
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log
7
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log
8
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():893] starting backend
12
+ 2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-22 19:01:09,860 INFO MainThread:75754 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-22 19:01:09,861 INFO MainThread:75754 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-22 19:01:09,865 INFO MainThread:75754 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-22 19:01:09,869 INFO MainThread:75754 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-22 19:01:12,112 INFO MainThread:75754 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-22 19:01:12,257 INFO MainThread:75754 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-22 19:01:16,111 INFO MainThread:75754 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-22 19:01:21,097 INFO MsgRouterThr:75754 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb ADDED
File without changes
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 55
39
+ - 66
40
+ "4": 3.10.0
41
+ "5": 0.19.11
42
+ "6": 4.52.3
43
+ "8":
44
+ - 5
45
+ "12": 0.19.11
46
+ "13": linux-x86_64
47
+ accelerator:
48
+ value: gpu
49
+ batch_size:
50
+ value: 32
51
+ bert_hidden_dim:
52
+ value: 768
53
+ bert_name:
54
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
55
+ check_val_every_n_epoch:
56
+ value: 1
57
+ cross_attention_freq:
58
+ value: 2
59
+ devices:
60
+ value: 0,1,2,3
61
+ filename:
62
+ value: stage1_06221723
63
+ init_checkpoint:
64
+ value: ""
65
+ init_lr:
66
+ value: 0.0001
67
+ lm:
68
+ value: true
69
+ load_4bit:
70
+ value: false
71
+ lr_decay_rate:
72
+ value: 0.9
73
+ match_batch_size:
74
+ value: 64
75
+ max_epochs:
76
+ value: 20
77
+ min_lr:
78
+ value: 1e-05
79
+ mix_dataset:
80
+ value: true
81
+ mode:
82
+ value: train
83
+ num_query_token:
84
+ value: 8
85
+ num_workers:
86
+ value: 4
87
+ plm_name:
88
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
89
+ plm_tune:
90
+ value: freeze
91
+ pool_size:
92
+ value: 0
93
+ precision:
94
+ value: bf16-mixed
95
+ projection_dim:
96
+ value: 256
97
+ prot_aug:
98
+ value: None
99
+ prot_max_len:
100
+ value: 1024
101
+ ptm:
102
+ value: true
103
+ rerank_cand_num:
104
+ value: 128
105
+ retrieval_eval_epoch:
106
+ value: 10
107
+ root:
108
+ value: data
109
+ save_every_n_epochs:
110
+ value: 5
111
+ scheduler:
112
+ value: linear_warmup_cosine_lr
113
+ seed:
114
+ value: 42
115
+ strategy:
116
+ value: deepspeed
117
+ temperature:
118
+ value: 0.1
119
+ text_max_len:
120
+ value: 128
121
+ use_wandb_logger:
122
+ value: true
123
+ warmup_lr:
124
+ value: 1e-06
125
+ warmup_steps:
126
+ value: 1000
127
+ weight_decay:
128
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.92it/s]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+ Traceback (most recent call last):
19
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
20
+ main(args)
21
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
22
+ trainer.fit(model, datamodule=dm)
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
24
+ call._call_and_handle_interrupt(
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
26
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
28
+ return function(*args, **kwargs)
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
30
+ self._run(model, ckpt_path=ckpt_path)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
32
+ results = self._run_stage()
33
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
34
+ self._run_sanity_check()
35
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
36
+ val_loop.run()
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
38
+ return loop_run(self, *args, **kwargs)
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
40
+ self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
42
+ output = call._call_strategy_hook(trainer, hook_name, *step_args)
43
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
44
+ output = fn(*args, **kwargs)
45
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
46
+ return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
48
+ wrapper_output = wrapper_module(*args, **kwargs)
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
50
+ return self._call_impl(*args, **kwargs)
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
52
+ return forward_call(*args, **kwargs)
53
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
54
+ ret_val = func(*args, **kwargs)
55
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
56
+ loss = self.module(*inputs, **kwargs)
57
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
58
+ return self._call_impl(*args, **kwargs)
59
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
60
+ return inner()
61
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
62
+ result = forward_call(*args, **kwargs)
63
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
64
+ out = method(*_args, **_kwargs)
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
66
+ return func(*args, **kwargs)
67
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
68
+ blip2_loss = self.blip2qformer(batch)
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
70
+ return self._call_impl(*args, **kwargs)
71
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
72
+ return forward_call(*args, **kwargs)
73
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
74
+ sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
75
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
76
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
77
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
78
+ return torch._C._nn.cross_entropy_loss(
79
+ RuntimeError: size mismatch (got input: [4], target: [1])
80
+ [rank0]: Traceback (most recent call last):
81
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
82
+ [rank0]: main(args)
83
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
84
+ [rank0]: trainer.fit(model, datamodule=dm)
85
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
86
+ [rank0]: call._call_and_handle_interrupt(
87
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
88
+ [rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
89
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
90
+ [rank0]: return function(*args, **kwargs)
91
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
92
+ [rank0]: self._run(model, ckpt_path=ckpt_path)
93
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
94
+ [rank0]: results = self._run_stage()
95
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
96
+ [rank0]: self._run_sanity_check()
97
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
98
+ [rank0]: val_loop.run()
99
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
100
+ [rank0]: return loop_run(self, *args, **kwargs)
101
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
102
+ [rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
103
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
104
+ [rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
105
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
106
+ [rank0]: output = fn(*args, **kwargs)
107
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
108
+ [rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
109
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
110
+ [rank0]: wrapper_output = wrapper_module(*args, **kwargs)
111
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
112
+ [rank0]: return self._call_impl(*args, **kwargs)
113
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
114
+ [rank0]: return forward_call(*args, **kwargs)
115
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
116
+ [rank0]: ret_val = func(*args, **kwargs)
117
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
118
+ [rank0]: loss = self.module(*inputs, **kwargs)
119
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
120
+ [rank0]: return self._call_impl(*args, **kwargs)
121
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
122
+ [rank0]: return inner()
123
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
124
+ [rank0]: result = forward_call(*args, **kwargs)
125
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
126
+ [rank0]: out = method(*_args, **_kwargs)
127
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
128
+ [rank0]: return func(*args, **kwargs)
129
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
130
+ [rank0]: blip2_loss = self.blip2qformer(batch)
131
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
132
+ [rank0]: return self._call_impl(*args, **kwargs)
133
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
134
+ [rank0]: return forward_call(*args, **kwargs)
135
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
136
+ [rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
137
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
138
+ [rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
139
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
140
+ [rank0]: return torch._C._nn.cross_entropy_loss(
141
+ [rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:04:29.455254Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "4",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363148800"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":11}}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-22T19:04:29.457094442+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-core.log"}
2
+ {"time":"2025-06-22T19:04:30.490221799+08:00","level":"INFO","msg":"created new stream","id":"bq9amgfj"}
3
+ {"time":"2025-06-22T19:04:30.490257318+08:00","level":"INFO","msg":"stream: started","id":"bq9amgfj"}
4
+ {"time":"2025-06-22T19:04:30.490294134+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq9amgfj"}
5
+ {"time":"2025-06-22T19:04:30.490304877+08:00","level":"INFO","msg":"sender: started","stream_id":"bq9amgfj"}
6
+ {"time":"2025-06-22T19:04:30.490450091+08:00","level":"INFO","msg":"handler: started","stream_id":"bq9amgfj"}
7
+ {"time":"2025-06-22T19:04:31.742014982+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-22T19:04:40.504265321+08:00","level":"INFO","msg":"stream: closing","id":"bq9amgfj"}
9
+ {"time":"2025-06-22T19:04:40.504307897+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-22T19:04:40.505067489+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-06-22T19:04:41.953923124+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2025-06-22T19:04:43.137437891+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq9amgfj"}
13
+ {"time":"2025-06-22T19:04:43.137489667+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq9amgfj"}
14
+ {"time":"2025-06-22T19:04:43.137501311+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq9amgfj"}
15
+ {"time":"2025-06-22T19:04:43.141402359+08:00","level":"INFO","msg":"stream: closed","id":"bq9amgfj"}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Configure stats pid to 79876
3
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log
7
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log
8
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():893] starting backend
12
+ 2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-22 19:04:29,450 INFO MainThread:79876 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-22 19:04:29,452 INFO MainThread:79876 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-22 19:04:29,456 INFO MainThread:79876 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-22 19:04:29,458 INFO MainThread:79876 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-22 19:04:31,693 INFO MainThread:79876 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-22 19:04:31,896 INFO MainThread:79876 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-22 19:04:31,897 INFO MainThread:79876 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-22 19:04:31,900 INFO MainThread:79876 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-22 19:04:35,461 INFO MainThread:79876 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 4, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-22 19:04:40,503 INFO MsgRouterThr:79876 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb ADDED
Binary file (20.6 kB). View file
 
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": loader1/val_loss/dataloader_idx_1
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": train_loss
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": loader2/val_loss/dataloader_idx_2
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": epoch
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": train_loss_ptm
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": train_loss_lm
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": loader0/val_loss_lm/dataloader_idx_0
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": loader2/val_loss_lm/dataloader_idx_2
76
+ "5": 2
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": loader1/val_loss_lm/dataloader_idx_1
82
+ "5": 2
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": train_loss_ptc
88
+ "5": 2
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
94
+ "5": 2
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": loader0/val_loss/dataloader_idx_0
100
+ "5": 2
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": lr
106
+ "5": 2
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
112
+ "5": 2
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ python_version: 3.10.0
118
+ t:
119
+ "1":
120
+ - 1
121
+ - 5
122
+ - 9
123
+ - 11
124
+ - 33
125
+ - 41
126
+ - 49
127
+ - 53
128
+ - 55
129
+ - 63
130
+ - 103
131
+ "2":
132
+ - 1
133
+ - 5
134
+ - 9
135
+ - 11
136
+ - 33
137
+ - 41
138
+ - 49
139
+ - 53
140
+ - 55
141
+ - 63
142
+ - 103
143
+ "3":
144
+ - 7
145
+ - 23
146
+ - 55
147
+ - 66
148
+ "4": 3.10.0
149
+ "5": 0.19.11
150
+ "6": 4.52.3
151
+ "8":
152
+ - 5
153
+ "12": 0.19.11
154
+ "13": linux-x86_64
155
+ accelerator:
156
+ value: gpu
157
+ batch_size:
158
+ value: 32
159
+ bert_hidden_dim:
160
+ value: 768
161
+ bert_name:
162
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
163
+ check_val_every_n_epoch:
164
+ value: 1
165
+ cross_attention_freq:
166
+ value: 2
167
+ devices:
168
+ value: 0,1,2,3,4,5,6,7
169
+ filename:
170
+ value: stage1_06221723
171
+ init_checkpoint:
172
+ value: ""
173
+ init_lr:
174
+ value: 0.0001
175
+ lm:
176
+ value: true
177
+ load_4bit:
178
+ value: false
179
+ lr_decay_rate:
180
+ value: 0.9
181
+ match_batch_size:
182
+ value: 64
183
+ max_epochs:
184
+ value: 20
185
+ min_lr:
186
+ value: 1e-05
187
+ mix_dataset:
188
+ value: true
189
+ mode:
190
+ value: train
191
+ num_query_token:
192
+ value: 8
193
+ num_workers:
194
+ value: 8
195
+ plm_name:
196
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
197
+ plm_tune:
198
+ value: freeze
199
+ pool_size:
200
+ value: 0
201
+ precision:
202
+ value: bf16-mixed
203
+ projection_dim:
204
+ value: 256
205
+ prot_aug:
206
+ value: None
207
+ prot_max_len:
208
+ value: 1024
209
+ ptm:
210
+ value: true
211
+ rerank_cand_num:
212
+ value: 128
213
+ retrieval_eval_epoch:
214
+ value: 10
215
+ root:
216
+ value: data
217
+ save_every_n_epochs:
218
+ value: 5
219
+ scheduler:
220
+ value: linear_warmup_cosine_lr
221
+ seed:
222
+ value: 42
223
+ strategy:
224
+ value: deepspeed
225
+ temperature:
226
+ value: 0.1
227
+ text_max_len:
228
+ value: 128
229
+ use_wandb_logger:
230
+ value: true
231
+ warmup_lr:
232
+ value: 1e-06
233
+ warmup_steps:
234
+ value: 1000
235
+ weight_decay:
236
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Epoch 3: 18%|██████████▊ | 11/61 [00:08<00:37, 1.35it/s, v_num=24k3]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+
19
+
20
+ Detected KeyboardInterrupt, attempting graceful shutdown ...
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:06:31.087290Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363152896"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"trainer/global_step":182,"loader2/val_loss_lm/dataloader_idx_2":4.984375,"loader0/val_loss_ptc/dataloader_idx_0":2.392578125,"loader2/val_loss_ptc/dataloader_idx_2":3.49609375,"loader0/val_loss/dataloader_idx_0":7.27734375,"loader0/val_loss_lm/dataloader_idx_0":4.2421875,"train_loss":6.375,"train_loss_ptm":0.63671875,"train_loss_lm":2.783203125,"loader1/val_loss/dataloader_idx_1":5.7578125,"_step":5,"_runtime":135.905828304,"_wandb":{"runtime":145},"lr":9.779754327610135e-05,"epoch":2,"loader1/val_loss_ptm/dataloader_idx_1":0.63330078125,"loader1/val_loss_ptc/dataloader_idx_1":2.71484375,"loader0/val_loss_ptm/dataloader_idx_0":0.6416015625,"loader2/val_loss_ptm/dataloader_idx_2":0.63525390625,"loader1/val_loss_lm/dataloader_idx_1":2.412109375,"loader2/val_loss/dataloader_idx_2":9.109375,"train_loss_ptc":2.953125,"_timestamp":1.7505905269926882e+09}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-22T19:06:31.147471317+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-core.log"}
2
+ {"time":"2025-06-22T19:06:32.175593884+08:00","level":"INFO","msg":"created new stream","id":"9wqt24k3"}
3
+ {"time":"2025-06-22T19:06:32.175639093+08:00","level":"INFO","msg":"stream: started","id":"9wqt24k3"}
4
+ {"time":"2025-06-22T19:06:32.175689814+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9wqt24k3"}
5
+ {"time":"2025-06-22T19:06:32.17571173+08:00","level":"INFO","msg":"sender: started","stream_id":"9wqt24k3"}
6
+ {"time":"2025-06-22T19:06:32.175747445+08:00","level":"INFO","msg":"handler: started","stream_id":"9wqt24k3"}
7
+ {"time":"2025-06-22T19:06:33.341553042+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-22T19:08:56.485679626+08:00","level":"INFO","msg":"stream: closing","id":"9wqt24k3"}
9
+ {"time":"2025-06-22T19:08:56.485774311+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-22T19:08:56.486438679+08:00","level":"INFO","msg":"Stopped system monitor"}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Configure stats pid to 82552
3
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log
7
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log
8
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():893] starting backend
12
+ 2025-06-22 19:06:31,084 INFO MainThread:82552 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-22 19:06:31,085 INFO MainThread:82552 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-22 19:06:31,087 INFO MainThread:82552 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-22 19:06:31,088 INFO MainThread:82552 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-22 19:06:31,147 INFO MainThread:82552 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-22 19:06:33,293 INFO MainThread:82552 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-22 19:06:33,470 INFO MainThread:82552 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-22 19:06:33,471 INFO MainThread:82552 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-22 19:06:33,475 INFO MainThread:82552 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-22 19:06:38,587 INFO MainThread:82552 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-22 19:08:56,484 INFO MsgRouterThr:82552 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 55
39
+ - 66
40
+ "4": 3.10.0
41
+ "5": 0.19.11
42
+ "6": 4.52.3
43
+ "8":
44
+ - 5
45
+ "12": 0.19.11
46
+ "13": linux-x86_64
47
+ accelerator:
48
+ value: gpu
49
+ batch_size:
50
+ value: 32
51
+ bert_hidden_dim:
52
+ value: 768
53
+ bert_name:
54
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
55
+ check_val_every_n_epoch:
56
+ value: 1
57
+ cross_attention_freq:
58
+ value: 2
59
+ devices:
60
+ value: 0,1,2,3
61
+ filename:
62
+ value: stage1_06221723
63
+ init_checkpoint:
64
+ value: ""
65
+ init_lr:
66
+ value: 0.0001
67
+ lm:
68
+ value: true
69
+ load_4bit:
70
+ value: false
71
+ lr_decay_rate:
72
+ value: 0.9
73
+ match_batch_size:
74
+ value: 64
75
+ max_epochs:
76
+ value: 20
77
+ min_lr:
78
+ value: 1e-05
79
+ mix_dataset:
80
+ value: true
81
+ mode:
82
+ value: train
83
+ num_query_token:
84
+ value: 8
85
+ num_workers:
86
+ value: 8
87
+ plm_name:
88
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
89
+ plm_tune:
90
+ value: freeze
91
+ pool_size:
92
+ value: 0
93
+ precision:
94
+ value: bf16-mixed
95
+ projection_dim:
96
+ value: 256
97
+ prot_aug:
98
+ value: None
99
+ prot_max_len:
100
+ value: 1024
101
+ ptm:
102
+ value: true
103
+ rerank_cand_num:
104
+ value: 128
105
+ retrieval_eval_epoch:
106
+ value: 10
107
+ root:
108
+ value: data
109
+ save_every_n_epochs:
110
+ value: 5
111
+ scheduler:
112
+ value: linear_warmup_cosine_lr
113
+ seed:
114
+ value: 42
115
+ strategy:
116
+ value: deepspeed
117
+ temperature:
118
+ value: 0.1
119
+ text_max_len:
120
+ value: 128
121
+ use_wandb_logger:
122
+ value: true
123
+ warmup_lr:
124
+ value: 1e-06
125
+ warmup_steps:
126
+ value: 1000
127
+ weight_decay:
128
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.83it/s]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+ Traceback (most recent call last):
19
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
20
+ main(args)
21
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
22
+ trainer.fit(model, datamodule=dm)
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
24
+ call._call_and_handle_interrupt(
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
26
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
28
+ return function(*args, **kwargs)
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
30
+ self._run(model, ckpt_path=ckpt_path)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
32
+ results = self._run_stage()
33
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
34
+ self._run_sanity_check()
35
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
36
+ val_loop.run()
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
38
+ return loop_run(self, *args, **kwargs)
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
40
+ self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
42
+ output = call._call_strategy_hook(trainer, hook_name, *step_args)
43
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
44
+ output = fn(*args, **kwargs)
45
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
46
+ return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
48
+ wrapper_output = wrapper_module(*args, **kwargs)
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
50
+ return self._call_impl(*args, **kwargs)
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
52
+ return forward_call(*args, **kwargs)
53
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
54
+ ret_val = func(*args, **kwargs)
55
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
56
+ loss = self.module(*inputs, **kwargs)
57
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
58
+ return self._call_impl(*args, **kwargs)
59
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
60
+ return inner()
61
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
62
+ result = forward_call(*args, **kwargs)
63
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
64
+ out = method(*_args, **_kwargs)
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
66
+ return func(*args, **kwargs)
67
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
68
+ blip2_loss = self.blip2qformer(batch)
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
70
+ return self._call_impl(*args, **kwargs)
71
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
72
+ return forward_call(*args, **kwargs)
73
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
74
+ sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
75
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
76
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
77
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
78
+ return torch._C._nn.cross_entropy_loss(
79
+ RuntimeError: size mismatch (got input: [4], target: [1])
80
+ [rank0]: Traceback (most recent call last):
81
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
82
+ [rank0]: main(args)
83
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
84
+ [rank0]: trainer.fit(model, datamodule=dm)
85
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
86
+ [rank0]: call._call_and_handle_interrupt(
87
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
88
+ [rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
89
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
90
+ [rank0]: return function(*args, **kwargs)
91
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
92
+ [rank0]: self._run(model, ckpt_path=ckpt_path)
93
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
94
+ [rank0]: results = self._run_stage()
95
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
96
+ [rank0]: self._run_sanity_check()
97
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
98
+ [rank0]: val_loop.run()
99
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
100
+ [rank0]: return loop_run(self, *args, **kwargs)
101
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
102
+ [rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
103
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
104
+ [rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
105
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
106
+ [rank0]: output = fn(*args, **kwargs)
107
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
108
+ [rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
109
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
110
+ [rank0]: wrapper_output = wrapper_module(*args, **kwargs)
111
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
112
+ [rank0]: return self._call_impl(*args, **kwargs)
113
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
114
+ [rank0]: return forward_call(*args, **kwargs)
115
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
116
+ [rank0]: ret_val = func(*args, **kwargs)
117
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
118
+ [rank0]: loss = self.module(*inputs, **kwargs)
119
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
120
+ [rank0]: return self._call_impl(*args, **kwargs)
121
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
122
+ [rank0]: return inner()
123
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
124
+ [rank0]: result = forward_call(*args, **kwargs)
125
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
126
+ [rank0]: out = method(*_args, **_kwargs)
127
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
128
+ [rank0]: return func(*args, **kwargs)
129
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
130
+ [rank0]: blip2_loss = self.blip2qformer(batch)
131
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
132
+ [rank0]: return self._call_impl(*args, **kwargs)
133
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
134
+ [rank0]: return forward_call(*args, **kwargs)
135
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
136
+ [rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
137
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
138
+ [rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
139
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
140
+ [rank0]: return torch._C._nn.cross_entropy_loss(
141
+ [rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:10:38.675049Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363165184"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":11}}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-22T19:10:38.679318052+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-core.log"}
2
+ {"time":"2025-06-22T19:10:39.726410578+08:00","level":"INFO","msg":"created new stream","id":"bu5vqvh5"}
3
+ {"time":"2025-06-22T19:10:39.726458298+08:00","level":"INFO","msg":"stream: started","id":"bu5vqvh5"}
4
+ {"time":"2025-06-22T19:10:39.726477847+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bu5vqvh5"}
5
+ {"time":"2025-06-22T19:10:39.726508843+08:00","level":"INFO","msg":"sender: started","stream_id":"bu5vqvh5"}
6
+ {"time":"2025-06-22T19:10:39.726652161+08:00","level":"INFO","msg":"handler: started","stream_id":"bu5vqvh5"}
7
+ {"time":"2025-06-22T19:10:40.990067167+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-22T19:10:49.81571789+08:00","level":"INFO","msg":"stream: closing","id":"bu5vqvh5"}
9
+ {"time":"2025-06-22T19:10:49.815789673+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-22T19:10:49.816535239+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-06-22T19:10:52.523961836+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2025-06-22T19:10:53.665534986+08:00","level":"INFO","msg":"handler: closed","stream_id":"bu5vqvh5"}
13
+ {"time":"2025-06-22T19:10:53.665598663+08:00","level":"INFO","msg":"sender: closed","stream_id":"bu5vqvh5"}
14
+ {"time":"2025-06-22T19:10:53.665594948+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bu5vqvh5"}
15
+ {"time":"2025-06-22T19:10:53.669904066+08:00","level":"INFO","msg":"stream: closed","id":"bu5vqvh5"}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Configure stats pid to 95583
3
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log
7
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log
8
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():893] starting backend
12
+ 2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-22 19:10:38,673 INFO MainThread:95583 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-22 19:10:38,674 INFO MainThread:95583 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-22 19:10:38,675 INFO MainThread:95583 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-22 19:10:38,678 INFO MainThread:95583 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-22 19:10:40,981 INFO MainThread:95583 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-22 19:10:41,126 INFO MainThread:95583 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-22 19:10:44,823 INFO MainThread:95583 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-22 19:10:49,814 INFO MsgRouterThr:95583 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb ADDED
Binary file (20.5 kB). View file
 
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 55
39
+ - 66
40
+ "4": 3.10.0
41
+ "5": 0.19.11
42
+ "6": 4.52.3
43
+ "8":
44
+ - 5
45
+ "12": 0.19.11
46
+ "13": linux-x86_64
47
+ accelerator:
48
+ value: gpu
49
+ batch_size:
50
+ value: 32
51
+ bert_hidden_dim:
52
+ value: 768
53
+ bert_name:
54
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
55
+ check_val_every_n_epoch:
56
+ value: 1
57
+ cross_attention_freq:
58
+ value: 2
59
+ devices:
60
+ value: 0,1,2,3
61
+ filename:
62
+ value: stage1_06221723
63
+ init_checkpoint:
64
+ value: ""
65
+ init_lr:
66
+ value: 0.0001
67
+ lm:
68
+ value: true
69
+ load_4bit:
70
+ value: false
71
+ lr_decay_rate:
72
+ value: 0.9
73
+ match_batch_size:
74
+ value: 64
75
+ max_epochs:
76
+ value: 20
77
+ min_lr:
78
+ value: 1e-05
79
+ mix_dataset:
80
+ value: true
81
+ mode:
82
+ value: train
83
+ num_query_token:
84
+ value: 8
85
+ num_workers:
86
+ value: 8
87
+ plm_name:
88
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
89
+ plm_tune:
90
+ value: freeze
91
+ pool_size:
92
+ value: 0
93
+ precision:
94
+ value: bf16-mixed
95
+ projection_dim:
96
+ value: 256
97
+ prot_aug:
98
+ value: None
99
+ prot_max_len:
100
+ value: 1024
101
+ ptm:
102
+ value: true
103
+ rerank_cand_num:
104
+ value: 128
105
+ retrieval_eval_epoch:
106
+ value: 10
107
+ root:
108
+ value: data
109
+ save_every_n_epochs:
110
+ value: 5
111
+ scheduler:
112
+ value: linear_warmup_cosine_lr
113
+ seed:
114
+ value: 42
115
+ strategy:
116
+ value: deepspeed
117
+ temperature:
118
+ value: 0.1
119
+ text_max_len:
120
+ value: 128
121
+ use_wandb_logger:
122
+ value: true
123
+ warmup_lr:
124
+ value: 1e-06
125
+ warmup_steps:
126
+ value: 1000
127
+ weight_decay:
128
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 100])
16
+ labels.shape: torch.Size([25])
17
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
18
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
19
+ Sanity Checking DataLoader 1: 0%| | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
20
+ labels.shape: torch.Size([32])
21
+ Sanity Checking DataLoader 1: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.84it/s]logits_per_graph.shape: torch.Size([18, 72])
22
+ labels.shape: torch.Size([18])
23
+ Sanity Checking DataLoader 2: 0%| | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
24
+ labels.shape: torch.Size([32])
25
+ Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.69it/s]logits_per_graph.shape: torch.Size([4])
26
+ labels.shape: torch.Size([1])
27
+ Traceback (most recent call last):
28
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
29
+ main(args)
30
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
31
+ trainer.fit(model, datamodule=dm)
32
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
33
+ call._call_and_handle_interrupt(
34
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
35
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
36
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
37
+ return function(*args, **kwargs)
38
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
39
+ self._run(model, ckpt_path=ckpt_path)
40
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
41
+ results = self._run_stage()
42
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
43
+ self._run_sanity_check()
44
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
45
+ val_loop.run()
46
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
47
+ return loop_run(self, *args, **kwargs)
48
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
49
+ self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
50
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
51
+ output = call._call_strategy_hook(trainer, hook_name, *step_args)
52
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
53
+ output = fn(*args, **kwargs)
54
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
55
+ return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
56
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
57
+ wrapper_output = wrapper_module(*args, **kwargs)
58
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
59
+ return self._call_impl(*args, **kwargs)
60
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
61
+ return forward_call(*args, **kwargs)
62
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
63
+ ret_val = func(*args, **kwargs)
64
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
65
+ loss = self.module(*inputs, **kwargs)
66
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
67
+ return self._call_impl(*args, **kwargs)
68
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
69
+ return inner()
70
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
71
+ result = forward_call(*args, **kwargs)
72
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
73
+ out = method(*_args, **_kwargs)
74
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
75
+ return func(*args, **kwargs)
76
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
77
+ blip2_loss = self.blip2qformer(batch)
78
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
79
+ return self._call_impl(*args, **kwargs)
80
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
81
+ return forward_call(*args, **kwargs)
82
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
83
+ sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
84
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
85
+ loss_graph = F.cross_entropy(logits_per_graph, labels)
86
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
87
+ return torch._C._nn.cross_entropy_loss(
88
+ RuntimeError: size mismatch (got input: [4], target: [1])
89
+ [rank0]: Traceback (most recent call last):
90
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
91
+ [rank0]: main(args)
92
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
93
+ [rank0]: trainer.fit(model, datamodule=dm)
94
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
95
+ [rank0]: call._call_and_handle_interrupt(
96
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
97
+ [rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
98
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
99
+ [rank0]: return function(*args, **kwargs)
100
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
101
+ [rank0]: self._run(model, ckpt_path=ckpt_path)
102
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
103
+ [rank0]: results = self._run_stage()
104
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
105
+ [rank0]: self._run_sanity_check()
106
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
107
+ [rank0]: val_loop.run()
108
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
109
+ [rank0]: return loop_run(self, *args, **kwargs)
110
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
111
+ [rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
112
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
113
+ [rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args)
114
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
115
+ [rank0]: output = fn(*args, **kwargs)
116
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
117
+ [rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
118
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
119
+ [rank0]: wrapper_output = wrapper_module(*args, **kwargs)
120
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
121
+ [rank0]: return self._call_impl(*args, **kwargs)
122
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
123
+ [rank0]: return forward_call(*args, **kwargs)
124
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
125
+ [rank0]: ret_val = func(*args, **kwargs)
126
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
127
+ [rank0]: loss = self.module(*inputs, **kwargs)
128
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
129
+ [rank0]: return self._call_impl(*args, **kwargs)
130
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
131
+ [rank0]: return inner()
132
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
133
+ [rank0]: result = forward_call(*args, **kwargs)
134
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
135
+ [rank0]: out = method(*_args, **_kwargs)
136
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
137
+ [rank0]: return func(*args, **kwargs)
138
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
139
+ [rank0]: blip2_loss = self.blip2qformer(batch)
140
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
141
+ [rank0]: return self._call_impl(*args, **kwargs)
142
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
143
+ [rank0]: return forward_call(*args, **kwargs)
144
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
145
+ [rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
146
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
147
+ [rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels)
148
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
149
+ [rank0]: return torch._C._nn.cross_entropy_loss(
150
+ [rank0]: RuntimeError: size mismatch (got input: [4], target: [1])
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:18:08.739768Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363197952"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":11}}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-22T19:18:08.742601474+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-core.log"}
2
+ {"time":"2025-06-22T19:18:09.773061599+08:00","level":"INFO","msg":"created new stream","id":"a2dszq0q"}
3
+ {"time":"2025-06-22T19:18:09.773105546+08:00","level":"INFO","msg":"stream: started","id":"a2dszq0q"}
4
+ {"time":"2025-06-22T19:18:09.773141968+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"a2dszq0q"}
5
+ {"time":"2025-06-22T19:18:09.773166056+08:00","level":"INFO","msg":"sender: started","stream_id":"a2dszq0q"}
6
+ {"time":"2025-06-22T19:18:09.773225667+08:00","level":"INFO","msg":"handler: started","stream_id":"a2dszq0q"}
7
+ {"time":"2025-06-22T19:18:11.012086945+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-22T19:18:19.860220439+08:00","level":"INFO","msg":"stream: closing","id":"a2dszq0q"}
9
+ {"time":"2025-06-22T19:18:19.860330929+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-22T19:18:19.861063374+08:00","level":"INFO","msg":"Stopped system monitor"}
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-22 19:18:08,732 INFO MainThread:99755 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Configure stats pid to 99755
3
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log
7
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log
8
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():893] starting backend
12
+ 2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-22 19:18:08,734 INFO MainThread:99755 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-22 19:18:08,736 INFO MainThread:99755 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-22 19:18:08,740 INFO MainThread:99755 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-22 19:18:08,746 INFO MainThread:99755 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-22 19:18:10,963 INFO MainThread:99755 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-22 19:18:11,141 INFO MainThread:99755 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-22 19:18:11,142 INFO MainThread:99755 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-22 19:18:11,146 INFO MainThread:99755 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-22 19:18:14,870 INFO MainThread:99755 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-22 19:18:19,859 INFO MsgRouterThr:99755 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb ADDED
File without changes
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 55
39
+ - 66
40
+ "4": 3.10.0
41
+ "5": 0.19.11
42
+ "6": 4.52.3
43
+ "8":
44
+ - 5
45
+ "12": 0.19.11
46
+ "13": linux-x86_64
47
+ accelerator:
48
+ value: gpu
49
+ batch_size:
50
+ value: 32
51
+ bert_hidden_dim:
52
+ value: 768
53
+ bert_name:
54
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
55
+ check_val_every_n_epoch:
56
+ value: 1
57
+ cross_attention_freq:
58
+ value: 2
59
+ devices:
60
+ value: 0,1,2,3,4,5,6,7
61
+ filename:
62
+ value: stage1_06221723
63
+ init_checkpoint:
64
+ value: ""
65
+ init_lr:
66
+ value: 0.0001
67
+ lm:
68
+ value: true
69
+ load_4bit:
70
+ value: false
71
+ lr_decay_rate:
72
+ value: 0.9
73
+ match_batch_size:
74
+ value: 64
75
+ max_epochs:
76
+ value: 20
77
+ min_lr:
78
+ value: 1e-05
79
+ mix_dataset:
80
+ value: true
81
+ mode:
82
+ value: train
83
+ num_query_token:
84
+ value: 8
85
+ num_workers:
86
+ value: 8
87
+ plm_name:
88
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
89
+ plm_tune:
90
+ value: freeze
91
+ pool_size:
92
+ value: 0
93
+ precision:
94
+ value: bf16-mixed
95
+ projection_dim:
96
+ value: 256
97
+ prot_aug:
98
+ value: None
99
+ prot_max_len:
100
+ value: 1024
101
+ ptm:
102
+ value: true
103
+ rerank_cand_num:
104
+ value: 128
105
+ retrieval_eval_epoch:
106
+ value: 10
107
+ root:
108
+ value: data
109
+ save_every_n_epochs:
110
+ value: 5
111
+ scheduler:
112
+ value: linear_warmup_cosine_lr
113
+ seed:
114
+ value: 42
115
+ strategy:
116
+ value: deepspeed
117
+ temperature:
118
+ value: 0.1
119
+ text_max_len:
120
+ value: 128
121
+ use_wandb_logger:
122
+ value: true
123
+ warmup_lr:
124
+ value: 1e-06
125
+ warmup_steps:
126
+ value: 1000
127
+ weight_decay:
128
+ value: 0.05
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [rank0]:W0622 19:26:47.840041 104393 site-packages/torch/distributed/distributed_c10d.py:2941] _object_to_tensor size: 81 hash value: 6444836214324640892
2
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
3
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
4
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
5
+
6
+ | Name | Type | Params | Mode
7
+ ------------------------------------------------------
8
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
9
+ ------------------------------------------------------
10
+ 179 M Trainable params
11
+ 147 M Non-trainable params
12
+ 327 M Total params
13
+ 1,309.467 Total estimated model params size (MB)
14
+ 5 Modules in train mode
15
+ 926 Modules in eval mode
16
+ Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([13, 104])
17
+ labels.shape: torch.Size([13])
18
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
19
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
20
+ Sanity Checking DataLoader 1: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 200])
21
+ labels.shape: torch.Size([25])
22
+ Sanity Checking DataLoader 2: 0%| | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([17, 136])
23
+ labels.shape: torch.Size([17])
24
+ Epoch 0: 0%| | 0/61 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 256])
25
+ labels.shape: torch.Size([32])
26
+ Epoch 0: 2%|█ | 1/61 [00:01<01:11, 0.84it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
27
+ labels.shape: torch.Size([32])
28
+ Epoch 0: 3%|██ | 2/61 [00:01<00:55, 1.07it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
29
+ labels.shape: torch.Size([32])
30
+ Epoch 0: 5%|███ | 3/61 [00:02<00:49, 1.17it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
31
+ labels.shape: torch.Size([32])
32
+ Epoch 0: 7%|████ | 4/61 [00:03<00:46, 1.24it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
33
+ labels.shape: torch.Size([32])
34
+ Epoch 0: 8%|█████ | 5/61 [00:03<00:43, 1.28it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
35
+ labels.shape: torch.Size([32])
36
+ Epoch 0: 10%|██████ | 6/61 [00:04<00:42, 1.31it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
37
+ labels.shape: torch.Size([32])
38
+ Epoch 0: 11%|███████ | 7/61 [00:05<00:40, 1.33it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
39
+ labels.shape: torch.Size([32])
40
+ Epoch 0: 13%|████████ | 8/61 [00:05<00:39, 1.35it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
41
+ labels.shape: torch.Size([32])
42
+ Epoch 0: 15%|█████████ | 9/61 [00:06<00:38, 1.36it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
43
+ labels.shape: torch.Size([32])
44
+ Epoch 0: 16%|█████████▊ | 10/61 [00:07<00:37, 1.37it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
45
+ labels.shape: torch.Size([32])
46
+ Epoch 0: 18%|██████████▊ | 11/61 [00:07<00:36, 1.38it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
47
+ labels.shape: torch.Size([32])
48
+ Epoch 0: 20%|███████████▊ | 12/61 [00:08<00:35, 1.39it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
49
+ labels.shape: torch.Size([32])
50
+ Epoch 0: 21%|████████████▊ | 13/61 [00:09<00:34, 1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
51
+ labels.shape: torch.Size([32])
52
+ Epoch 0: 23%|█████████████▊ | 14/61 [00:09<00:33, 1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
53
+ labels.shape: torch.Size([32])
54
+ Epoch 0: 25%|██████████████▊ | 15/61 [00:10<00:32, 1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
55
+ labels.shape: torch.Size([32])
56
+ Epoch 0: 26%|███████████████▋ | 16/61 [00:11<00:31, 1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
57
+ labels.shape: torch.Size([32])
58
+ Epoch 0: 28%|████████████████▋ | 17/61 [00:12<00:31, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
59
+ labels.shape: torch.Size([32])
60
+ Epoch 0: 30%|█████████████████▋ | 18/61 [00:12<00:30, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
61
+ labels.shape: torch.Size([32])
62
+ Epoch 0: 31%|██████████████████▋ | 19/61 [00:13<00:29, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
63
+ labels.shape: torch.Size([32])
64
+ Epoch 0: 33%|███████████████████▋ | 20/61 [00:14<00:28, 1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
65
+ labels.shape: torch.Size([32])
66
+ Epoch 0: 34%|████████████████████▋ | 21/61 [00:14<00:28, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
67
+ labels.shape: torch.Size([32])
68
+ Epoch 0: 36%|█████████████████████▋ | 22/61 [00:15<00:27, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
69
+ labels.shape: torch.Size([32])
70
+ Epoch 0: 38%|██████████████████████▌ | 23/61 [00:16<00:26, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
71
+ labels.shape: torch.Size([32])
72
+ Epoch 0: 39%|███████████████████████▌ | 24/61 [00:16<00:25, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
73
+ labels.shape: torch.Size([32])
74
+ Epoch 0: 41%|████████████████████████▌ | 25/61 [00:17<00:25, 1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
75
+ labels.shape: torch.Size([32])
76
+ Epoch 0: 43%|█████████████████████████▌ | 26/61 [00:18<00:24, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
77
+ labels.shape: torch.Size([32])
78
+ Epoch 0: 44%|██████████████████████████▌ | 27/61 [00:18<00:23, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
79
+ labels.shape: torch.Size([32])
80
+ Epoch 0: 46%|███████████████████████████▌ | 28/61 [00:19<00:22, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
81
+ labels.shape: torch.Size([32])
82
+ Epoch 0: 48%|████████████████████████████▌ | 29/61 [00:20<00:22, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
83
+ labels.shape: torch.Size([32])
84
+ Epoch 0: 49%|█████████████████████████████▌ | 30/61 [00:20<00:21, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
85
+ labels.shape: torch.Size([32])
86
+ Epoch 0: 51%|██████████████████████████████▍ | 31/61 [00:21<00:20, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
87
+ labels.shape: torch.Size([32])
88
+ Epoch 0: 52%|███████████████████████████████▍ | 32/61 [00:22<00:20, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
89
+ labels.shape: torch.Size([32])
90
+ Epoch 0: 54%|████████████████████████████████▍ | 33/61 [00:22<00:19, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
91
+ labels.shape: torch.Size([32])
92
+ Epoch 0: 56%|█████████████████████████████████▍ | 34/61 [00:23<00:18, 1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
93
+ labels.shape: torch.Size([32])
94
+ Epoch 0: 57%|██████████████████████████████████▍ | 35/61 [00:24<00:17, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
95
+ labels.shape: torch.Size([32])
96
+ Epoch 0: 59%|█████████████████████████���█████████▍ | 36/61 [00:24<00:17, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
97
+ labels.shape: torch.Size([32])
98
+ Epoch 0: 61%|████████████████████████████████████▍ | 37/61 [00:25<00:16, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
99
+ labels.shape: torch.Size([32])
100
+ Epoch 0: 62%|█████████████████████████████████████▍ | 38/61 [00:26<00:15, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
101
+ labels.shape: torch.Size([32])
102
+ Epoch 0: 64%|██████████████████████████████████████▎ | 39/61 [00:26<00:15, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
103
+ labels.shape: torch.Size([32])
104
+ Epoch 0: 66%|███████████████████████████████████████▎ | 40/61 [00:27<00:14, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
105
+ labels.shape: torch.Size([32])
106
+ Epoch 0: 67%|████████████████████████████████████████▎ | 41/61 [00:28<00:13, 1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
107
+ labels.shape: torch.Size([32])
108
+ Epoch 0: 69%|█████████████████████████████████████████▎ | 42/61 [00:28<00:13, 1.45it/s, v_num=vn72]
109
+
110
+ Detected KeyboardInterrupt, attempting graceful shutdown ...
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==2.2.6
2
+ confection==0.1.5
3
+ text-unidecode==1.3
4
+ contexttimer==0.3.3
5
+ omegaconf==2.3.0
6
+ tzdata==2025.2
7
+ nvidia-cuda-nvrtc-cu12==12.4.127
8
+ plotly==6.1.1
9
+ decord==0.6.0
10
+ nvidia-cublas-cu12==12.4.5.8
11
+ scipy==1.15.3
12
+ nvidia-cufile-cu12==1.11.1.6
13
+ parso==0.8.4
14
+ python-dateutil==2.9.0.post0
15
+ setuptools==78.1.1
16
+ aiosignal==1.3.2
17
+ joblib==1.5.1
18
+ platformdirs==4.3.8
19
+ regex==2024.11.6
20
+ aiohappyeyeballs==2.6.1
21
+ virtualenv==20.31.2
22
+ lazy_loader==0.4
23
+ rich==14.0.0
24
+ timm==0.4.12
25
+ antlr4-python3-runtime==4.9.3
26
+ pandas==2.2.3
27
+ salesforce-lavis==1.0.2
28
+ gitdb==4.0.12
29
+ six==1.17.0
30
+ smmap==5.0.2
31
+ annotated-types==0.7.0
32
+ pyparsing==3.2.3
33
+ Jinja2==3.1.6
34
+ ptyprocess==0.7.0
35
+ streamlit==1.45.1
36
+ idna==3.10
37
+ nvidia-cusolver-cu12==11.6.1.9
38
+ tenacity==9.1.2
39
+ sentencepiece==0.2.0
40
+ matplotlib-inline==0.1.7
41
+ typing-inspection==0.4.1
42
+ packaging==24.2
43
+ nltk==3.9.1
44
+ wheel==0.45.1
45
+ catalogue==2.0.10
46
+ matplotlib==3.10.3
47
+ propcache==0.3.1
48
+ Pygments==2.19.1
49
+ nvidia-nvjitlink-cu12==12.4.127
50
+ requests==2.32.3
51
+ filelock==3.18.0
52
+ pexpect==4.9.0
53
+ opencv-python-headless==4.5.5.64
54
+ certifi==2025.4.26
55
+ nvidia-nvtx-cu12==12.4.127
56
+ bleach==6.2.0
57
+ typing_extensions==4.13.2
58
+ tornado==6.5.1
59
+ networkx==3.4.2
60
+ sympy==1.13.1
61
+ watchdog==6.0.0
62
+ kaggle==1.7.4.5
63
+ nvidia-ml-py==12.575.51
64
+ pyarrow==20.0.0
65
+ mpmath==1.3.0
66
+ lightning-utilities==0.14.3
67
+ ftfy==6.3.1
68
+ triton==3.2.0
69
+ referencing==0.36.2
70
+ ipython==8.36.0
71
+ yarl==1.20.0
72
+ language_data==1.3.0
73
+ cycler==0.12.1
74
+ python-magic==0.4.27
75
+ wasabi==1.1.3
76
+ protobuf==6.31.0
77
+ murmurhash==1.0.13
78
+ jsonschema-specifications==2025.4.1
79
+ blinker==1.9.0
80
+ fonttools==4.58.0
81
+ imageio==2.37.0
82
+ pycocoevalcap==1.2
83
+ nvidia-cuda-cupti-cu12==12.4.127
84
+ fairscale==0.4.4
85
+ hjson==3.1.0
86
+ identify==2.6.12
87
+ mdurl==0.1.2
88
+ decorator==5.2.1
89
+ distlib==0.3.9
90
+ webencodings==0.5.1
91
+ kiwisolver==1.4.8
92
+ srsly==2.5.1
93
+ frozenlist==1.6.0
94
+ blis==1.3.0
95
+ contourpy==1.3.2
96
+ hf-xet==1.1.2
97
+ cymem==2.0.11
98
+ pillow==11.2.1
99
+ pycocotools==2.0.8
100
+ pre_commit==4.2.0
101
+ wrapt==1.17.2
102
+ nvidia-curand-cu12==10.3.5.147
103
+ spacy==3.8.7
104
+ rpds-py==0.25.1
105
+ exceptiongroup==1.3.0
106
+ braceexpand==0.1.7
107
+ rouge_score==0.1.2
108
+ async-timeout==5.0.1
109
+ torchmetrics==1.7.1
110
+ nvidia-nccl-cu12==2.21.5
111
+ wcwidth==0.2.13
112
+ nvidia-cusparselt-cu12==0.6.2
113
+ scikit-image==0.25.2
114
+ urllib3==2.4.0
115
+ portalocker==3.1.1
116
+ smart-open==7.1.0
117
+ cfgv==3.4.0
118
+ markdown-it-py==3.0.0
119
+ charset-normalizer==3.4.2
120
+ executing==2.2.0
121
+ pure_eval==0.2.3
122
+ safetensors==0.5.3
123
+ spacy-legacy==3.0.12
124
+ shellingham==1.5.4
125
+ langcodes==3.5.0
126
+ pytz==2025.2
127
+ iopath==0.1.10
128
+ weasel==0.4.1
129
+ tifffile==2025.5.10
130
+ nodeenv==1.9.1
131
+ absl-py==2.2.2
132
+ einops==0.8.1
133
+ msgpack==1.1.0
134
+ pydantic_core==2.33.2
135
+ ninja==1.11.1.4
136
+ altair==5.5.0
137
+ attrs==25.3.0
138
+ tqdm==4.67.1
139
+ deepspeed==0.16.10+b666844f
140
+ pydeck==0.9.1
141
+ stack-data==0.6.3
142
+ pydantic==2.11.5
143
+ torch==2.6.0
144
+ nvidia-cudnn-cu12==9.1.0.70
145
+ python-slugify==8.0.4
146
+ webdataset==0.2.111
147
+ pytorch-lightning==2.5.1.post0
148
+ prompt_toolkit==3.0.51
149
+ psutil==7.0.0
150
+ opendatasets==0.1.22
151
+ asttokens==3.0.0
152
+ MarkupSafe==3.0.2
153
+ multidict==6.4.4
154
+ nvidia-cufft-cu12==11.2.1.3
155
+ GitPython==3.1.44
156
+ PyYAML==6.0.2
157
+ cloudpathlib==0.21.1
158
+ toml==0.10.2
159
+ marisa-trie==1.2.1
160
+ traitlets==5.14.3
161
+ cachetools==5.5.2
162
+ spacy-loggers==1.0.5
163
+ nvidia-cuda-runtime-cu12==12.4.127
164
+ torchvision==0.21.0
165
+ nvidia-cusparse-cu12==12.3.1.170
166
+ jedi==0.19.2
167
+ thinc==8.3.6
168
+ py-cpuinfo==9.0.0
169
+ yacs==0.1.8
170
+ cffi==1.17.1
171
+ preshed==3.0.10
172
+ more-itertools==10.7.0
173
+ bigmodelvis==0.0.1
174
+ datasets==3.6.0
175
+ huggingface-hub==0.32.1
176
+ narwhals==1.41.0
177
+ xxhash==3.5.0
178
+ sentry-sdk==2.29.1
179
+ aiohttp==3.12.2
180
+ opencv-python==4.11.0.86
181
+ pycryptodome==3.23.0
182
+ threadpoolctl==3.6.0
183
+ flash-attn==2.7.1.post1
184
+ transformers==4.52.3
185
+ pycparser==2.22
186
+ pathlib==1.0.1
187
+ dill==0.3.8
188
+ scikit-learn==1.6.1
189
+ tokenizers==0.21.1
190
+ aliyun-python-sdk-core==2.16.0
191
+ fsspec==2025.3.0
192
+ jmespath==0.10.0
193
+ click==8.2.1
194
+ delta-center-client==0.0.4
195
+ cheroot==10.0.1
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ jsonschema==4.24.0
199
+ oss2==2.15.0
200
+ multiprocess==0.70.16
201
+ jaraco.functools==4.1.0
202
+ web.py==0.62
203
+ aliyun-python-sdk-kms==2.16.5
204
+ cryptography==45.0.3
205
+ pip==25.1.1
206
+ docker-pycreds==0.4.0
207
+ typer==0.16.0
208
+ opendelta==0.3.2
209
+ crcmod==1.7
210
+ jaraco.functools==4.0.1
211
+ inflect==7.3.1
212
+ jaraco.collections==5.1.0
213
+ packaging==24.2
214
+ wheel==0.45.1
215
+ tomli==2.0.1
216
+ platformdirs==4.2.2
217
+ typing_extensions==4.12.2
218
+ more-itertools==10.3.0
219
+ autocommand==2.2.2
220
+ jaraco.text==3.12.1
221
+ importlib_metadata==8.0.0
222
+ jaraco.context==5.3.0
223
+ zipp==3.19.2
224
+ backports.tarfile==1.2.0
225
+ typeguard==4.3.0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-22T11:26:45.457479Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06221723",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_06221723/",
35
+ "host": "dsw-251511-c5cfcb8-lwcpt",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1363202048"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb ADDED
Binary file (58.8 kB). View file