diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png new file mode 100644 index 0000000000000000000000000000000000000000..222147e4cec9831188c45a4ba6ade9f1ac1a3413 Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png differ diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png new file mode 100644 index 0000000000000000000000000000000000000000..7c8e896828ef41bb9ebf1c1db4ce07960056e233 Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png differ diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..2e832a5e1cf8fcae3eff36135494606451bc6681 Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png differ diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png new file mode 100644 index 0000000000000000000000000000000000000000..2a4d9e34bc9895ce2027278445b0d1c8f6cc4c72 Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png differ diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..db9af64b1989799f82949ea045620c46f7de49b5 Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png differ diff --git a/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png new file mode 100644 index 0000000000000000000000000000000000000000..47644303c5adbc99e2f353539b353b6bd3e241df Binary files /dev/null and b/BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png differ diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..225b939fd758f215d2d88ebc5d5ea331dbc59e7e --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml @@ -0,0 +1,128 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3 +filename: + value: stage1_06221723 +init_checkpoint: + value: 
"" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1cad93d53934a8e691d8541132aa0758fd8f3692 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log @@ -0,0 +1,141 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.86it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32): +Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in + main(args) + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main + trainer.fit(model, datamodule=dm) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage + self._run_sanity_check() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check + val_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator + return loop_run(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run + self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step + output = call._call_strategy_hook(trainer, hook_name, *step_args) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step + return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step + blip2_loss = self.blip2qformer(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward + sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global + loss_graph = F.cross_entropy(logits_per_graph, labels) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy + return torch._C._nn.cross_entropy_loss( +RuntimeError: size mismatch (got input: [4], target: [1]) +[rank0]: Traceback (most recent call last): +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in +[rank0]: main(args) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main +[rank0]: trainer.fit(model, datamodule=dm) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage +[rank0]: self._run_sanity_check() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check +[rank0]: val_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator +[rank0]: return loop_run(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 
145, in run +[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step +[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step +[rank0]: blip2_loss = self.blip2qformer(batch) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward +[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global +[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in 
cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: RuntimeError: size mismatch (got input: [4], target: [1]) diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..92bc26e2f236896f569edd37a24ef914a02f96de --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt @@ -0,0 +1,225 @@ +numpy==2.2.6 +confection==0.1.5 +text-unidecode==1.3 +contexttimer==0.3.3 +omegaconf==2.3.0 +tzdata==2025.2 +nvidia-cuda-nvrtc-cu12==12.4.127 +plotly==6.1.1 +decord==0.6.0 +nvidia-cublas-cu12==12.4.5.8 +scipy==1.15.3 +nvidia-cufile-cu12==1.11.1.6 +parso==0.8.4 +python-dateutil==2.9.0.post0 +setuptools==78.1.1 +aiosignal==1.3.2 +joblib==1.5.1 +platformdirs==4.3.8 +regex==2024.11.6 +aiohappyeyeballs==2.6.1 +virtualenv==20.31.2 +lazy_loader==0.4 +rich==14.0.0 +timm==0.4.12 +antlr4-python3-runtime==4.9.3 +pandas==2.2.3 +salesforce-lavis==1.0.2 +gitdb==4.0.12 +six==1.17.0 +smmap==5.0.2 +annotated-types==0.7.0 +pyparsing==3.2.3 +Jinja2==3.1.6 +ptyprocess==0.7.0 +streamlit==1.45.1 +idna==3.10 +nvidia-cusolver-cu12==11.6.1.9 +tenacity==9.1.2 +sentencepiece==0.2.0 +matplotlib-inline==0.1.7 +typing-inspection==0.4.1 +packaging==24.2 +nltk==3.9.1 +wheel==0.45.1 +catalogue==2.0.10 +matplotlib==3.10.3 +propcache==0.3.1 +Pygments==2.19.1 +nvidia-nvjitlink-cu12==12.4.127 +requests==2.32.3 +filelock==3.18.0 +pexpect==4.9.0 +opencv-python-headless==4.5.5.64 +certifi==2025.4.26 +nvidia-nvtx-cu12==12.4.127 +bleach==6.2.0 +typing_extensions==4.13.2 +tornado==6.5.1 +networkx==3.4.2 +sympy==1.13.1 +watchdog==6.0.0 +kaggle==1.7.4.5 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +mpmath==1.3.0 +lightning-utilities==0.14.3 +ftfy==6.3.1 +triton==3.2.0 +referencing==0.36.2 +ipython==8.36.0 +yarl==1.20.0 +language_data==1.3.0 +cycler==0.12.1 +python-magic==0.4.27 +wasabi==1.1.3 +protobuf==6.31.0 +murmurhash==1.0.13 +jsonschema-specifications==2025.4.1 +blinker==1.9.0 +fonttools==4.58.0 +imageio==2.37.0 +pycocoevalcap==1.2 +nvidia-cuda-cupti-cu12==12.4.127 +fairscale==0.4.4 +hjson==3.1.0 +identify==2.6.12 +mdurl==0.1.2 +decorator==5.2.1 +distlib==0.3.9 +webencodings==0.5.1 +kiwisolver==1.4.8 +srsly==2.5.1 +frozenlist==1.6.0 +blis==1.3.0 +contourpy==1.3.2 +hf-xet==1.1.2 +cymem==2.0.11 +pillow==11.2.1 +pycocotools==2.0.8 +pre_commit==4.2.0 +wrapt==1.17.2 +nvidia-curand-cu12==10.3.5.147 +spacy==3.8.7 +rpds-py==0.25.1 +exceptiongroup==1.3.0 +braceexpand==0.1.7 +rouge_score==0.1.2 +async-timeout==5.0.1 +torchmetrics==1.7.1 +nvidia-nccl-cu12==2.21.5 +wcwidth==0.2.13 +nvidia-cusparselt-cu12==0.6.2 +scikit-image==0.25.2 +urllib3==2.4.0 +portalocker==3.1.1 +smart-open==7.1.0 +cfgv==3.4.0 +markdown-it-py==3.0.0 +charset-normalizer==3.4.2 +executing==2.2.0 +pure_eval==0.2.3 +safetensors==0.5.3 +spacy-legacy==3.0.12 +shellingham==1.5.4 +langcodes==3.5.0 +pytz==2025.2 +iopath==0.1.10 +weasel==0.4.1 +tifffile==2025.5.10 +nodeenv==1.9.1 +absl-py==2.2.2 +einops==0.8.1 +msgpack==1.1.0 +pydantic_core==2.33.2 +ninja==1.11.1.4 +altair==5.5.0 +attrs==25.3.0 +tqdm==4.67.1 +deepspeed==0.16.10+b666844f +pydeck==0.9.1 +stack-data==0.6.3 +pydantic==2.11.5 +torch==2.6.0 +nvidia-cudnn-cu12==9.1.0.70 +python-slugify==8.0.4 +webdataset==0.2.111 +pytorch-lightning==2.5.1.post0 +prompt_toolkit==3.0.51 +psutil==7.0.0 +opendatasets==0.1.22 +asttokens==3.0.0 +MarkupSafe==3.0.2 
+multidict==6.4.4 +nvidia-cufft-cu12==11.2.1.3 +GitPython==3.1.44 +PyYAML==6.0.2 +cloudpathlib==0.21.1 +toml==0.10.2 +marisa-trie==1.2.1 +traitlets==5.14.3 +cachetools==5.5.2 +spacy-loggers==1.0.5 +nvidia-cuda-runtime-cu12==12.4.127 +torchvision==0.21.0 +nvidia-cusparse-cu12==12.3.1.170 +jedi==0.19.2 +thinc==8.3.6 +py-cpuinfo==9.0.0 +yacs==0.1.8 +cffi==1.17.1 +preshed==3.0.10 +more-itertools==10.7.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +huggingface-hub==0.32.1 +narwhals==1.41.0 +xxhash==3.5.0 +sentry-sdk==2.29.1 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycryptodome==3.23.0 +threadpoolctl==3.6.0 +flash-attn==2.7.1.post1 +transformers==4.52.3 +pycparser==2.22 +pathlib==1.0.1 +dill==0.3.8 +scikit-learn==1.6.1 +tokenizers==0.21.1 +aliyun-python-sdk-core==2.16.0 +fsspec==2025.3.0 +jmespath==0.10.0 +click==8.2.1 +delta-center-client==0.0.4 +cheroot==10.0.1 +wandb==0.19.11 +setproctitle==1.3.6 +jsonschema==4.24.0 +oss2==2.15.0 +multiprocess==0.70.16 +jaraco.functools==4.1.0 +web.py==0.62 +aliyun-python-sdk-kms==2.16.5 +cryptography==45.0.3 +pip==25.1.1 +docker-pycreds==0.4.0 +typer==0.16.0 +opendelta==0.3.2 +crcmod==1.7 +jaraco.functools==4.0.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +wheel==0.45.1 +tomli==2.0.1 +platformdirs==4.2.2 +typing_extensions==4.12.2 +more-itertools==10.3.0 +autocommand==2.2.2 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +zipp==3.19.2 +backports.tarfile==1.2.0 +typeguard==4.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5d829ee76ce7473894f3aaf2f3a69dfa0c685d08 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-22T11:01:09.864619Z", + "args": [ + "--devices", + "0,1,2,3", + "--mode", + "train", + "--filename", + "stage1_06221723", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06221723/", + "host": "dsw-251511-c5cfcb8-lwcpt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1363144704" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": 
"Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9f61df7d54e464e8f8f30d59b5674f8ef6e7a4a9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":11}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..85b0632db53385ac194d02f46ea735edae6676a0 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2025-06-22T19:01:09.866022252+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-core.log"} +{"time":"2025-06-22T19:01:10.922793248+08:00","level":"INFO","msg":"created new stream","id":"tul2l6xd"} +{"time":"2025-06-22T19:01:10.922840238+08:00","level":"INFO","msg":"stream: started","id":"tul2l6xd"} +{"time":"2025-06-22T19:01:10.922861712+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"tul2l6xd"} +{"time":"2025-06-22T19:01:10.922902903+08:00","level":"INFO","msg":"sender: started","stream_id":"tul2l6xd"} +{"time":"2025-06-22T19:01:10.922946705+08:00","level":"INFO","msg":"handler: started","stream_id":"tul2l6xd"} +{"time":"2025-06-22T19:01:12.123540259+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-22T19:01:21.097810278+08:00","level":"INFO","msg":"stream: closing","id":"tul2l6xd"} +{"time":"2025-06-22T19:01:21.097899274+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-22T19:01:21.098725356+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0c8d80a6aef49b30107f5387f8f14228f74bfb3a --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log @@ -0,0 +1,24 @@ +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Configure stats pid to 75754 +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-22 19:01:09,858 INFO 
MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():852] calling init triggers +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():893] starting backend +2025-06-22 19:01:09,858 INFO MainThread:75754 [wandb_init.py:init():897] sending inform_init request +2025-06-22 19:01:09,860 INFO MainThread:75754 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-22 19:01:09,861 INFO MainThread:75754 [wandb_init.py:init():907] backend started and connected +2025-06-22 19:01:09,865 INFO MainThread:75754 [wandb_init.py:init():1005] updated telemetry +2025-06-22 19:01:09,869 INFO MainThread:75754 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-22 19:01:12,112 INFO MainThread:75754 [wandb_init.py:init():1104] starting run threads in backend +2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_console_start():2573] atexit reg +2025-06-22 19:01:12,253 INFO MainThread:75754 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-22 19:01:12,256 INFO MainThread:75754 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-22 19:01:12,257 INFO MainThread:75754 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-22 19:01:16,111 INFO MainThread:75754 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} +2025-06-22 19:01:21,097 INFO MsgRouterThr:75754 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. 
diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b55a647b9aa37a8b2e76519b64247dab9550889 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml @@ -0,0 +1,128 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3 +filename: + value: stage1_06221723 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 4 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..05c35ea21619f8233cfe7f17b952e2c1aed5f459 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log @@ -0,0 +1,141 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. 
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.92it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32): +Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in + main(args) + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main + trainer.fit(model, datamodule=dm) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage + self._run_sanity_check() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check + val_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator + return loop_run(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run + self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step + output = call._call_strategy_hook(trainer, hook_name, *step_args) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step + return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step + blip2_loss = self.blip2qformer(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward + sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global + loss_graph = F.cross_entropy(logits_per_graph, labels) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy + return torch._C._nn.cross_entropy_loss( +RuntimeError: size mismatch (got input: [4], target: [1]) +[rank0]: Traceback (most recent call last): +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in +[rank0]: main(args) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main +[rank0]: trainer.fit(model, datamodule=dm) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage +[rank0]: self._run_sanity_check() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check +[rank0]: val_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator +[rank0]: return loop_run(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run +[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step +[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step +[rank0]: blip2_loss = self.blip2qformer(batch) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward +[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global +[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: RuntimeError: size mismatch (got input: [4], target: [1]) diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..92bc26e2f236896f569edd37a24ef914a02f96de --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt @@ -0,0 +1,225 @@ +numpy==2.2.6 +confection==0.1.5 +text-unidecode==1.3 +contexttimer==0.3.3 +omegaconf==2.3.0 +tzdata==2025.2 +nvidia-cuda-nvrtc-cu12==12.4.127 +plotly==6.1.1 +decord==0.6.0 +nvidia-cublas-cu12==12.4.5.8 +scipy==1.15.3 +nvidia-cufile-cu12==1.11.1.6 +parso==0.8.4 +python-dateutil==2.9.0.post0 +setuptools==78.1.1 +aiosignal==1.3.2 +joblib==1.5.1 +platformdirs==4.3.8 +regex==2024.11.6 +aiohappyeyeballs==2.6.1 +virtualenv==20.31.2 +lazy_loader==0.4 +rich==14.0.0 +timm==0.4.12 +antlr4-python3-runtime==4.9.3 +pandas==2.2.3 +salesforce-lavis==1.0.2 +gitdb==4.0.12 +six==1.17.0 +smmap==5.0.2 +annotated-types==0.7.0 +pyparsing==3.2.3 +Jinja2==3.1.6 +ptyprocess==0.7.0 +streamlit==1.45.1 +idna==3.10 +nvidia-cusolver-cu12==11.6.1.9 +tenacity==9.1.2 +sentencepiece==0.2.0 +matplotlib-inline==0.1.7 +typing-inspection==0.4.1 +packaging==24.2 +nltk==3.9.1 +wheel==0.45.1 +catalogue==2.0.10 +matplotlib==3.10.3 +propcache==0.3.1 +Pygments==2.19.1 +nvidia-nvjitlink-cu12==12.4.127 +requests==2.32.3 +filelock==3.18.0 +pexpect==4.9.0 +opencv-python-headless==4.5.5.64 +certifi==2025.4.26 +nvidia-nvtx-cu12==12.4.127 +bleach==6.2.0 +typing_extensions==4.13.2 +tornado==6.5.1 +networkx==3.4.2 +sympy==1.13.1 +watchdog==6.0.0 +kaggle==1.7.4.5 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +mpmath==1.3.0 +lightning-utilities==0.14.3 +ftfy==6.3.1 +triton==3.2.0 +referencing==0.36.2 +ipython==8.36.0 +yarl==1.20.0 +language_data==1.3.0 +cycler==0.12.1 +python-magic==0.4.27 +wasabi==1.1.3 +protobuf==6.31.0 +murmurhash==1.0.13 +jsonschema-specifications==2025.4.1 +blinker==1.9.0 +fonttools==4.58.0 +imageio==2.37.0 +pycocoevalcap==1.2 +nvidia-cuda-cupti-cu12==12.4.127 +fairscale==0.4.4 +hjson==3.1.0 +identify==2.6.12 +mdurl==0.1.2 +decorator==5.2.1 +distlib==0.3.9 +webencodings==0.5.1 +kiwisolver==1.4.8 +srsly==2.5.1 +frozenlist==1.6.0 +blis==1.3.0 +contourpy==1.3.2 +hf-xet==1.1.2 +cymem==2.0.11 +pillow==11.2.1 +pycocotools==2.0.8 +pre_commit==4.2.0 +wrapt==1.17.2 +nvidia-curand-cu12==10.3.5.147 +spacy==3.8.7 +rpds-py==0.25.1 +exceptiongroup==1.3.0 +braceexpand==0.1.7 +rouge_score==0.1.2 +async-timeout==5.0.1 +torchmetrics==1.7.1 +nvidia-nccl-cu12==2.21.5 +wcwidth==0.2.13 +nvidia-cusparselt-cu12==0.6.2 +scikit-image==0.25.2 +urllib3==2.4.0 +portalocker==3.1.1 +smart-open==7.1.0 +cfgv==3.4.0 
+markdown-it-py==3.0.0 +charset-normalizer==3.4.2 +executing==2.2.0 +pure_eval==0.2.3 +safetensors==0.5.3 +spacy-legacy==3.0.12 +shellingham==1.5.4 +langcodes==3.5.0 +pytz==2025.2 +iopath==0.1.10 +weasel==0.4.1 +tifffile==2025.5.10 +nodeenv==1.9.1 +absl-py==2.2.2 +einops==0.8.1 +msgpack==1.1.0 +pydantic_core==2.33.2 +ninja==1.11.1.4 +altair==5.5.0 +attrs==25.3.0 +tqdm==4.67.1 +deepspeed==0.16.10+b666844f +pydeck==0.9.1 +stack-data==0.6.3 +pydantic==2.11.5 +torch==2.6.0 +nvidia-cudnn-cu12==9.1.0.70 +python-slugify==8.0.4 +webdataset==0.2.111 +pytorch-lightning==2.5.1.post0 +prompt_toolkit==3.0.51 +psutil==7.0.0 +opendatasets==0.1.22 +asttokens==3.0.0 +MarkupSafe==3.0.2 +multidict==6.4.4 +nvidia-cufft-cu12==11.2.1.3 +GitPython==3.1.44 +PyYAML==6.0.2 +cloudpathlib==0.21.1 +toml==0.10.2 +marisa-trie==1.2.1 +traitlets==5.14.3 +cachetools==5.5.2 +spacy-loggers==1.0.5 +nvidia-cuda-runtime-cu12==12.4.127 +torchvision==0.21.0 +nvidia-cusparse-cu12==12.3.1.170 +jedi==0.19.2 +thinc==8.3.6 +py-cpuinfo==9.0.0 +yacs==0.1.8 +cffi==1.17.1 +preshed==3.0.10 +more-itertools==10.7.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +huggingface-hub==0.32.1 +narwhals==1.41.0 +xxhash==3.5.0 +sentry-sdk==2.29.1 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycryptodome==3.23.0 +threadpoolctl==3.6.0 +flash-attn==2.7.1.post1 +transformers==4.52.3 +pycparser==2.22 +pathlib==1.0.1 +dill==0.3.8 +scikit-learn==1.6.1 +tokenizers==0.21.1 +aliyun-python-sdk-core==2.16.0 +fsspec==2025.3.0 +jmespath==0.10.0 +click==8.2.1 +delta-center-client==0.0.4 +cheroot==10.0.1 +wandb==0.19.11 +setproctitle==1.3.6 +jsonschema==4.24.0 +oss2==2.15.0 +multiprocess==0.70.16 +jaraco.functools==4.1.0 +web.py==0.62 +aliyun-python-sdk-kms==2.16.5 +cryptography==45.0.3 +pip==25.1.1 +docker-pycreds==0.4.0 +typer==0.16.0 +opendelta==0.3.2 +crcmod==1.7 +jaraco.functools==4.0.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +wheel==0.45.1 +tomli==2.0.1 +platformdirs==4.2.2 +typing_extensions==4.12.2 +more-itertools==10.3.0 +autocommand==2.2.2 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +zipp==3.19.2 +backports.tarfile==1.2.0 +typeguard==4.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7632b2155b037e82c03a97d46c6dcd15720260ae --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-22T11:04:29.455254Z", + "args": [ + "--devices", + "0,1,2,3", + "--mode", + "train", + "--filename", + "stage1_06221723", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "4", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06221723/", + "host": "dsw-251511-c5cfcb8-lwcpt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 
64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1363148800" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9f61df7d54e464e8f8f30d59b5674f8ef6e7a4a9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":11}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..677fc977dbdd49fc3608aa8d86d01d0e87edf5ea --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-06-22T19:04:29.457094442+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-core.log"} +{"time":"2025-06-22T19:04:30.490221799+08:00","level":"INFO","msg":"created new stream","id":"bq9amgfj"} +{"time":"2025-06-22T19:04:30.490257318+08:00","level":"INFO","msg":"stream: started","id":"bq9amgfj"} +{"time":"2025-06-22T19:04:30.490294134+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq9amgfj"} +{"time":"2025-06-22T19:04:30.490304877+08:00","level":"INFO","msg":"sender: started","stream_id":"bq9amgfj"} +{"time":"2025-06-22T19:04:30.490450091+08:00","level":"INFO","msg":"handler: started","stream_id":"bq9amgfj"} +{"time":"2025-06-22T19:04:31.742014982+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-22T19:04:40.504265321+08:00","level":"INFO","msg":"stream: closing","id":"bq9amgfj"} +{"time":"2025-06-22T19:04:40.504307897+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-22T19:04:40.505067489+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-06-22T19:04:41.953923124+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-22T19:04:43.137437891+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq9amgfj"} 
+{"time":"2025-06-22T19:04:43.137489667+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq9amgfj"} +{"time":"2025-06-22T19:04:43.137501311+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq9amgfj"} +{"time":"2025-06-22T19:04:43.141402359+08:00","level":"INFO","msg":"stream: closed","id":"bq9amgfj"} diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..6591d59d24ae9c9af090b161bd439e66c7e908c9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log @@ -0,0 +1,24 @@ +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Configure stats pid to 79876 +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():852] calling init triggers +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():893] starting backend +2025-06-22 19:04:29,449 INFO MainThread:79876 [wandb_init.py:init():897] sending inform_init request +2025-06-22 19:04:29,450 INFO MainThread:79876 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-22 19:04:29,452 INFO MainThread:79876 [wandb_init.py:init():907] backend started and connected +2025-06-22 19:04:29,456 INFO MainThread:79876 [wandb_init.py:init():1005] updated telemetry +2025-06-22 19:04:29,458 INFO MainThread:79876 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-22 19:04:31,693 INFO MainThread:79876 [wandb_init.py:init():1104] starting run threads in backend +2025-06-22 19:04:31,896 INFO MainThread:79876 [wandb_run.py:_console_start():2573] atexit reg +2025-06-22 19:04:31,897 INFO MainThread:79876 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-22 19:04:31,899 INFO MainThread:79876 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-06-22 19:04:31,900 INFO MainThread:79876 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-22 19:04:35,461 INFO MainThread:79876 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 4, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} +2025-06-22 19:04:40,503 INFO MsgRouterThr:79876 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb new file mode 100644 index 0000000000000000000000000000000000000000..46f220de2e340a53533da480bdcb0c00bb2ffe0b Binary files /dev/null and b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb differ diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c06edaeca61d3376f51f885c3e62fbf1e3ead6a3 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml @@ -0,0 +1,236 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": loader1/val_loss/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": trainer/global_step + "6": + - 3 + "7": [] + - "1": loader1/val_loss_ptm/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss_ptm/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_ptm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_lm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss_ptc/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_ptm/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_lm/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss_lm/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader1/val_loss_lm/dataloader_idx_1 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train_loss_ptc + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss_ptc/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader0/val_loss/dataloader_idx_0 + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": lr + 
"5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": loader2/val_loss_ptc/dataloader_idx_2 + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +filename: + value: stage1_06221723 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..29fe1dd563cadf6304cb5ecbab3769f70e371244 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log @@ -0,0 +1,20 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Epoch 3: 18%|██████████▊ | 11/61 [00:08<00:37, 1.35it/s, v_num=24k3] +/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32): + + +Detected KeyboardInterrupt, attempting graceful shutdown ... 
diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..92bc26e2f236896f569edd37a24ef914a02f96de --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt @@ -0,0 +1,225 @@ +numpy==2.2.6 +confection==0.1.5 +text-unidecode==1.3 +contexttimer==0.3.3 +omegaconf==2.3.0 +tzdata==2025.2 +nvidia-cuda-nvrtc-cu12==12.4.127 +plotly==6.1.1 +decord==0.6.0 +nvidia-cublas-cu12==12.4.5.8 +scipy==1.15.3 +nvidia-cufile-cu12==1.11.1.6 +parso==0.8.4 +python-dateutil==2.9.0.post0 +setuptools==78.1.1 +aiosignal==1.3.2 +joblib==1.5.1 +platformdirs==4.3.8 +regex==2024.11.6 +aiohappyeyeballs==2.6.1 +virtualenv==20.31.2 +lazy_loader==0.4 +rich==14.0.0 +timm==0.4.12 +antlr4-python3-runtime==4.9.3 +pandas==2.2.3 +salesforce-lavis==1.0.2 +gitdb==4.0.12 +six==1.17.0 +smmap==5.0.2 +annotated-types==0.7.0 +pyparsing==3.2.3 +Jinja2==3.1.6 +ptyprocess==0.7.0 +streamlit==1.45.1 +idna==3.10 +nvidia-cusolver-cu12==11.6.1.9 +tenacity==9.1.2 +sentencepiece==0.2.0 +matplotlib-inline==0.1.7 +typing-inspection==0.4.1 +packaging==24.2 +nltk==3.9.1 +wheel==0.45.1 +catalogue==2.0.10 +matplotlib==3.10.3 +propcache==0.3.1 +Pygments==2.19.1 +nvidia-nvjitlink-cu12==12.4.127 +requests==2.32.3 +filelock==3.18.0 +pexpect==4.9.0 +opencv-python-headless==4.5.5.64 +certifi==2025.4.26 +nvidia-nvtx-cu12==12.4.127 +bleach==6.2.0 +typing_extensions==4.13.2 +tornado==6.5.1 +networkx==3.4.2 +sympy==1.13.1 +watchdog==6.0.0 +kaggle==1.7.4.5 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +mpmath==1.3.0 +lightning-utilities==0.14.3 +ftfy==6.3.1 +triton==3.2.0 +referencing==0.36.2 +ipython==8.36.0 +yarl==1.20.0 +language_data==1.3.0 +cycler==0.12.1 +python-magic==0.4.27 +wasabi==1.1.3 +protobuf==6.31.0 +murmurhash==1.0.13 +jsonschema-specifications==2025.4.1 +blinker==1.9.0 +fonttools==4.58.0 +imageio==2.37.0 +pycocoevalcap==1.2 +nvidia-cuda-cupti-cu12==12.4.127 +fairscale==0.4.4 +hjson==3.1.0 +identify==2.6.12 +mdurl==0.1.2 +decorator==5.2.1 +distlib==0.3.9 +webencodings==0.5.1 +kiwisolver==1.4.8 +srsly==2.5.1 +frozenlist==1.6.0 +blis==1.3.0 +contourpy==1.3.2 +hf-xet==1.1.2 +cymem==2.0.11 +pillow==11.2.1 +pycocotools==2.0.8 +pre_commit==4.2.0 +wrapt==1.17.2 +nvidia-curand-cu12==10.3.5.147 +spacy==3.8.7 +rpds-py==0.25.1 +exceptiongroup==1.3.0 +braceexpand==0.1.7 +rouge_score==0.1.2 +async-timeout==5.0.1 +torchmetrics==1.7.1 +nvidia-nccl-cu12==2.21.5 +wcwidth==0.2.13 +nvidia-cusparselt-cu12==0.6.2 +scikit-image==0.25.2 +urllib3==2.4.0 +portalocker==3.1.1 +smart-open==7.1.0 +cfgv==3.4.0 +markdown-it-py==3.0.0 +charset-normalizer==3.4.2 +executing==2.2.0 +pure_eval==0.2.3 +safetensors==0.5.3 +spacy-legacy==3.0.12 +shellingham==1.5.4 +langcodes==3.5.0 +pytz==2025.2 +iopath==0.1.10 +weasel==0.4.1 +tifffile==2025.5.10 +nodeenv==1.9.1 +absl-py==2.2.2 +einops==0.8.1 +msgpack==1.1.0 +pydantic_core==2.33.2 +ninja==1.11.1.4 +altair==5.5.0 +attrs==25.3.0 +tqdm==4.67.1 +deepspeed==0.16.10+b666844f +pydeck==0.9.1 +stack-data==0.6.3 +pydantic==2.11.5 +torch==2.6.0 +nvidia-cudnn-cu12==9.1.0.70 +python-slugify==8.0.4 +webdataset==0.2.111 +pytorch-lightning==2.5.1.post0 +prompt_toolkit==3.0.51 +psutil==7.0.0 +opendatasets==0.1.22 +asttokens==3.0.0 +MarkupSafe==3.0.2 +multidict==6.4.4 +nvidia-cufft-cu12==11.2.1.3 +GitPython==3.1.44 +PyYAML==6.0.2 +cloudpathlib==0.21.1 +toml==0.10.2 +marisa-trie==1.2.1 
+traitlets==5.14.3 +cachetools==5.5.2 +spacy-loggers==1.0.5 +nvidia-cuda-runtime-cu12==12.4.127 +torchvision==0.21.0 +nvidia-cusparse-cu12==12.3.1.170 +jedi==0.19.2 +thinc==8.3.6 +py-cpuinfo==9.0.0 +yacs==0.1.8 +cffi==1.17.1 +preshed==3.0.10 +more-itertools==10.7.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +huggingface-hub==0.32.1 +narwhals==1.41.0 +xxhash==3.5.0 +sentry-sdk==2.29.1 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycryptodome==3.23.0 +threadpoolctl==3.6.0 +flash-attn==2.7.1.post1 +transformers==4.52.3 +pycparser==2.22 +pathlib==1.0.1 +dill==0.3.8 +scikit-learn==1.6.1 +tokenizers==0.21.1 +aliyun-python-sdk-core==2.16.0 +fsspec==2025.3.0 +jmespath==0.10.0 +click==8.2.1 +delta-center-client==0.0.4 +cheroot==10.0.1 +wandb==0.19.11 +setproctitle==1.3.6 +jsonschema==4.24.0 +oss2==2.15.0 +multiprocess==0.70.16 +jaraco.functools==4.1.0 +web.py==0.62 +aliyun-python-sdk-kms==2.16.5 +cryptography==45.0.3 +pip==25.1.1 +docker-pycreds==0.4.0 +typer==0.16.0 +opendelta==0.3.2 +crcmod==1.7 +jaraco.functools==4.0.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +wheel==0.45.1 +tomli==2.0.1 +platformdirs==4.2.2 +typing_extensions==4.12.2 +more-itertools==10.3.0 +autocommand==2.2.2 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +zipp==3.19.2 +backports.tarfile==1.2.0 +typeguard==4.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..64fe1eaa649ee5f6953e7f4c0b049f6bd499063a --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-22T11:06:31.087290Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage1_06221723", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06221723/", + "host": "dsw-251511-c5cfcb8-lwcpt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1363152896" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + 
"name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e0de6d18d9e0e9360f295ed764fe1e38e7882e --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json @@ -0,0 +1 @@ +{"trainer/global_step":182,"loader2/val_loss_lm/dataloader_idx_2":4.984375,"loader0/val_loss_ptc/dataloader_idx_0":2.392578125,"loader2/val_loss_ptc/dataloader_idx_2":3.49609375,"loader0/val_loss/dataloader_idx_0":7.27734375,"loader0/val_loss_lm/dataloader_idx_0":4.2421875,"train_loss":6.375,"train_loss_ptm":0.63671875,"train_loss_lm":2.783203125,"loader1/val_loss/dataloader_idx_1":5.7578125,"_step":5,"_runtime":135.905828304,"_wandb":{"runtime":145},"lr":9.779754327610135e-05,"epoch":2,"loader1/val_loss_ptm/dataloader_idx_1":0.63330078125,"loader1/val_loss_ptc/dataloader_idx_1":2.71484375,"loader0/val_loss_ptm/dataloader_idx_0":0.6416015625,"loader2/val_loss_ptm/dataloader_idx_2":0.63525390625,"loader1/val_loss_lm/dataloader_idx_1":2.412109375,"loader2/val_loss/dataloader_idx_2":9.109375,"train_loss_ptc":2.953125,"_timestamp":1.7505905269926882e+09} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c273e063340e3dafb9e4893f7966db88e4f004a7 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2025-06-22T19:06:31.147471317+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-core.log"} +{"time":"2025-06-22T19:06:32.175593884+08:00","level":"INFO","msg":"created new stream","id":"9wqt24k3"} +{"time":"2025-06-22T19:06:32.175639093+08:00","level":"INFO","msg":"stream: started","id":"9wqt24k3"} +{"time":"2025-06-22T19:06:32.175689814+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9wqt24k3"} +{"time":"2025-06-22T19:06:32.17571173+08:00","level":"INFO","msg":"sender: started","stream_id":"9wqt24k3"} +{"time":"2025-06-22T19:06:32.175747445+08:00","level":"INFO","msg":"handler: started","stream_id":"9wqt24k3"} +{"time":"2025-06-22T19:06:33.341553042+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-22T19:08:56.485679626+08:00","level":"INFO","msg":"stream: closing","id":"9wqt24k3"} +{"time":"2025-06-22T19:08:56.485774311+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-22T19:08:56.486438679+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log new file mode 100644 index 
0000000000000000000000000000000000000000..e075583a3150e2503f16f563c10e3fc70b9a4cb0 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log @@ -0,0 +1,24 @@ +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Configure stats pid to 82552 +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():852] calling init triggers +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-22 19:06:31,083 INFO MainThread:82552 [wandb_init.py:init():893] starting backend +2025-06-22 19:06:31,084 INFO MainThread:82552 [wandb_init.py:init():897] sending inform_init request +2025-06-22 19:06:31,085 INFO MainThread:82552 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-22 19:06:31,087 INFO MainThread:82552 [wandb_init.py:init():907] backend started and connected +2025-06-22 19:06:31,088 INFO MainThread:82552 [wandb_init.py:init():1005] updated telemetry +2025-06-22 19:06:31,147 INFO MainThread:82552 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-22 19:06:33,293 INFO MainThread:82552 [wandb_init.py:init():1104] starting run threads in backend +2025-06-22 19:06:33,470 INFO MainThread:82552 [wandb_run.py:_console_start():2573] atexit reg +2025-06-22 19:06:33,471 INFO MainThread:82552 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-22 19:06:33,474 INFO MainThread:82552 [wandb_run.py:_redirect():2513] Redirects installed. 
+2025-06-22 19:06:33,475 INFO MainThread:82552 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-22 19:06:38,587 INFO MainThread:82552 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} +2025-06-22 19:08:56,484 INFO MsgRouterThr:82552 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..225b939fd758f215d2d88ebc5d5ea331dbc59e7e --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml @@ -0,0 +1,128 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3 +filename: + value: stage1_06221723 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git 
a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6d83c55191173e964c70f68cd7b0de964dd0b3f7 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log @@ -0,0 +1,141 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Sanity Checking DataLoader 2: 50%|██████████████████████████▌ | 1/2 [00:00<00:00, 1.83it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32): +Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module> + main(args) + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main + trainer.fit(model, datamodule=dm) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage + self._run_sanity_check() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check + val_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator + return loop_run(self, *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run + self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step + output = call._call_strategy_hook(trainer, hook_name, *step_args) + File
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step + return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step + blip2_loss = self.blip2qformer(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward + sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global + loss_graph = F.cross_entropy(logits_per_graph, labels) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy + return torch._C._nn.cross_entropy_loss( +RuntimeError: size mismatch (got input: [4], target: [1]) +[rank0]: Traceback (most recent call last): +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in +[rank0]: main(args) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main +[rank0]: trainer.fit(model, datamodule=dm) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage +[rank0]: self._run_sanity_check() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check +[rank0]: val_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator +[rank0]: return loop_run(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run +[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step +[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in 
inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step +[rank0]: blip2_loss = self.blip2qformer(batch) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward +[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global +[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: RuntimeError: size mismatch (got input: [4], target: [1]) diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..92bc26e2f236896f569edd37a24ef914a02f96de --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt @@ -0,0 +1,225 @@ +numpy==2.2.6 +confection==0.1.5 +text-unidecode==1.3 +contexttimer==0.3.3 +omegaconf==2.3.0 +tzdata==2025.2 +nvidia-cuda-nvrtc-cu12==12.4.127 +plotly==6.1.1 +decord==0.6.0 +nvidia-cublas-cu12==12.4.5.8 +scipy==1.15.3 +nvidia-cufile-cu12==1.11.1.6 +parso==0.8.4 +python-dateutil==2.9.0.post0 +setuptools==78.1.1 +aiosignal==1.3.2 +joblib==1.5.1 +platformdirs==4.3.8 +regex==2024.11.6 +aiohappyeyeballs==2.6.1 +virtualenv==20.31.2 +lazy_loader==0.4 +rich==14.0.0 +timm==0.4.12 +antlr4-python3-runtime==4.9.3 +pandas==2.2.3 +salesforce-lavis==1.0.2 +gitdb==4.0.12 +six==1.17.0 +smmap==5.0.2 +annotated-types==0.7.0 +pyparsing==3.2.3 +Jinja2==3.1.6 +ptyprocess==0.7.0 +streamlit==1.45.1 +idna==3.10 +nvidia-cusolver-cu12==11.6.1.9 +tenacity==9.1.2 +sentencepiece==0.2.0 +matplotlib-inline==0.1.7 +typing-inspection==0.4.1 +packaging==24.2 +nltk==3.9.1 +wheel==0.45.1 +catalogue==2.0.10 +matplotlib==3.10.3 +propcache==0.3.1 +Pygments==2.19.1 +nvidia-nvjitlink-cu12==12.4.127 +requests==2.32.3 +filelock==3.18.0 +pexpect==4.9.0 +opencv-python-headless==4.5.5.64 +certifi==2025.4.26 +nvidia-nvtx-cu12==12.4.127 +bleach==6.2.0 +typing_extensions==4.13.2 +tornado==6.5.1 +networkx==3.4.2 +sympy==1.13.1 +watchdog==6.0.0 +kaggle==1.7.4.5 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +mpmath==1.3.0 +lightning-utilities==0.14.3 +ftfy==6.3.1 +triton==3.2.0 +referencing==0.36.2 +ipython==8.36.0 +yarl==1.20.0 +language_data==1.3.0 +cycler==0.12.1 +python-magic==0.4.27 +wasabi==1.1.3 +protobuf==6.31.0 +murmurhash==1.0.13 
+jsonschema-specifications==2025.4.1 +blinker==1.9.0 +fonttools==4.58.0 +imageio==2.37.0 +pycocoevalcap==1.2 +nvidia-cuda-cupti-cu12==12.4.127 +fairscale==0.4.4 +hjson==3.1.0 +identify==2.6.12 +mdurl==0.1.2 +decorator==5.2.1 +distlib==0.3.9 +webencodings==0.5.1 +kiwisolver==1.4.8 +srsly==2.5.1 +frozenlist==1.6.0 +blis==1.3.0 +contourpy==1.3.2 +hf-xet==1.1.2 +cymem==2.0.11 +pillow==11.2.1 +pycocotools==2.0.8 +pre_commit==4.2.0 +wrapt==1.17.2 +nvidia-curand-cu12==10.3.5.147 +spacy==3.8.7 +rpds-py==0.25.1 +exceptiongroup==1.3.0 +braceexpand==0.1.7 +rouge_score==0.1.2 +async-timeout==5.0.1 +torchmetrics==1.7.1 +nvidia-nccl-cu12==2.21.5 +wcwidth==0.2.13 +nvidia-cusparselt-cu12==0.6.2 +scikit-image==0.25.2 +urllib3==2.4.0 +portalocker==3.1.1 +smart-open==7.1.0 +cfgv==3.4.0 +markdown-it-py==3.0.0 +charset-normalizer==3.4.2 +executing==2.2.0 +pure_eval==0.2.3 +safetensors==0.5.3 +spacy-legacy==3.0.12 +shellingham==1.5.4 +langcodes==3.5.0 +pytz==2025.2 +iopath==0.1.10 +weasel==0.4.1 +tifffile==2025.5.10 +nodeenv==1.9.1 +absl-py==2.2.2 +einops==0.8.1 +msgpack==1.1.0 +pydantic_core==2.33.2 +ninja==1.11.1.4 +altair==5.5.0 +attrs==25.3.0 +tqdm==4.67.1 +deepspeed==0.16.10+b666844f +pydeck==0.9.1 +stack-data==0.6.3 +pydantic==2.11.5 +torch==2.6.0 +nvidia-cudnn-cu12==9.1.0.70 +python-slugify==8.0.4 +webdataset==0.2.111 +pytorch-lightning==2.5.1.post0 +prompt_toolkit==3.0.51 +psutil==7.0.0 +opendatasets==0.1.22 +asttokens==3.0.0 +MarkupSafe==3.0.2 +multidict==6.4.4 +nvidia-cufft-cu12==11.2.1.3 +GitPython==3.1.44 +PyYAML==6.0.2 +cloudpathlib==0.21.1 +toml==0.10.2 +marisa-trie==1.2.1 +traitlets==5.14.3 +cachetools==5.5.2 +spacy-loggers==1.0.5 +nvidia-cuda-runtime-cu12==12.4.127 +torchvision==0.21.0 +nvidia-cusparse-cu12==12.3.1.170 +jedi==0.19.2 +thinc==8.3.6 +py-cpuinfo==9.0.0 +yacs==0.1.8 +cffi==1.17.1 +preshed==3.0.10 +more-itertools==10.7.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +huggingface-hub==0.32.1 +narwhals==1.41.0 +xxhash==3.5.0 +sentry-sdk==2.29.1 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycryptodome==3.23.0 +threadpoolctl==3.6.0 +flash-attn==2.7.1.post1 +transformers==4.52.3 +pycparser==2.22 +pathlib==1.0.1 +dill==0.3.8 +scikit-learn==1.6.1 +tokenizers==0.21.1 +aliyun-python-sdk-core==2.16.0 +fsspec==2025.3.0 +jmespath==0.10.0 +click==8.2.1 +delta-center-client==0.0.4 +cheroot==10.0.1 +wandb==0.19.11 +setproctitle==1.3.6 +jsonschema==4.24.0 +oss2==2.15.0 +multiprocess==0.70.16 +jaraco.functools==4.1.0 +web.py==0.62 +aliyun-python-sdk-kms==2.16.5 +cryptography==45.0.3 +pip==25.1.1 +docker-pycreds==0.4.0 +typer==0.16.0 +opendelta==0.3.2 +crcmod==1.7 +jaraco.functools==4.0.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +wheel==0.45.1 +tomli==2.0.1 +platformdirs==4.2.2 +typing_extensions==4.12.2 +more-itertools==10.3.0 +autocommand==2.2.2 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +zipp==3.19.2 +backports.tarfile==1.2.0 +typeguard==4.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..91f250b6ef8daba73dba6e7ec5f0b9a7993fca09 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-22T11:10:38.675049Z", + "args": [ + 
"--devices", + "0,1,2,3", + "--mode", + "train", + "--filename", + "stage1_06221723", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06221723/", + "host": "dsw-251511-c5cfcb8-lwcpt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1363165184" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9f61df7d54e464e8f8f30d59b5674f8ef6e7a4a9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":11}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9c8790e53ae7c0dca569db8b17b4d6de9fc33ff7 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log @@ -0,0 +1,15 @@ +{"time":"2025-06-22T19:10:38.679318052+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-core.log"} +{"time":"2025-06-22T19:10:39.726410578+08:00","level":"INFO","msg":"created new stream","id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:39.726458298+08:00","level":"INFO","msg":"stream: started","id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:39.726477847+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:39.726508843+08:00","level":"INFO","msg":"sender: 
started","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:39.726652161+08:00","level":"INFO","msg":"handler: started","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:40.990067167+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-22T19:10:49.81571789+08:00","level":"INFO","msg":"stream: closing","id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:49.815789673+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-22T19:10:49.816535239+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-06-22T19:10:52.523961836+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-06-22T19:10:53.665534986+08:00","level":"INFO","msg":"handler: closed","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:53.665598663+08:00","level":"INFO","msg":"sender: closed","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:53.665594948+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bu5vqvh5"} +{"time":"2025-06-22T19:10:53.669904066+08:00","level":"INFO","msg":"stream: closed","id":"bu5vqvh5"} diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..995338d911ca0e9b76f31e355e4baa6738e11200 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log @@ -0,0 +1,24 @@ +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Configure stats pid to 95583 +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():852] calling init triggers +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():893] starting backend +2025-06-22 19:10:38,671 INFO MainThread:95583 [wandb_init.py:init():897] sending inform_init request +2025-06-22 19:10:38,673 INFO MainThread:95583 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-22 19:10:38,674 INFO MainThread:95583 [wandb_init.py:init():907] backend started and connected +2025-06-22 19:10:38,675 INFO MainThread:95583 [wandb_init.py:init():1005] updated telemetry +2025-06-22 19:10:38,678 INFO MainThread:95583 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-22 19:10:40,981 INFO MainThread:95583 [wandb_init.py:init():1104] starting run threads in backend 
+2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_console_start():2573] atexit reg +2025-06-22 19:10:41,122 INFO MainThread:95583 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-22 19:10:41,125 INFO MainThread:95583 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-22 19:10:41,126 INFO MainThread:95583 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-22 19:10:44,823 INFO MainThread:95583 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} +2025-06-22 19:10:49,814 INFO MsgRouterThr:95583 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..73d6653e48f400eaffd6b2de2acc64dad358add0 Binary files /dev/null and b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb differ diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..225b939fd758f215d2d88ebc5d5ea331dbc59e7e --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml @@ -0,0 +1,128 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3 +filename: + value: stage1_06221723 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 
+mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b0b498c4140e017c96521879e201aa43ed801f9c --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log @@ -0,0 +1,150 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s] +Traceback (most recent call last): + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module> + main(args) + File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main + trainer.fit(model, datamodule=dm) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit + call._call_and_handle_interrupt( + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt + return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch + return function(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl + self._run(model, ckpt_path=ckpt_path) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run + results = self._run_stage() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage + self._run_sanity_check() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check + val_loop.run() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator + return loop_run(self, *args, **kwargs) + File
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run + self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step + output = call._call_strategy_hook(trainer, hook_name, *step_args) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook + output = fn(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step + return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ + wrapper_output = wrapper_module(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward + loss = self.module(*inputs, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl + return inner() + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner + result = forward_call(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward + out = method(*_args, **_kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step + blip2_loss = self.blip2qformer(batch) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl + return forward_call(*args, **kwargs) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward + sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) + File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global + loss_graph = F.cross_entropy(logits_per_graph, labels) + File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy + return torch._C._nn.cross_entropy_loss( +RuntimeError: size mismatch (got input: [4], target: [1]) +[rank0]: Traceback (most recent call last): +[rank0]: File 
"/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in +[rank0]: main(args) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main +[rank0]: trainer.fit(model, datamodule=dm) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit +[rank0]: call._call_and_handle_interrupt( +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt +[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch +[rank0]: return function(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl +[rank0]: self._run(model, ckpt_path=ckpt_path) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run +[rank0]: results = self._run_stage() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage +[rank0]: self._run_sanity_check() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check +[rank0]: val_loop.run() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator +[rank0]: return loop_run(self, *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run +[rank0]: self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step +[rank0]: output = call._call_strategy_hook(trainer, hook_name, *step_args) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook +[rank0]: output = fn(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step +[rank0]: return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__ +[rank0]: wrapper_output = wrapper_module(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn +[rank0]: ret_val = func(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward +[rank0]: loss = self.module(*inputs, **kwargs) +[rank0]: File 
"/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl +[rank0]: return inner() +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner +[rank0]: result = forward_call(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward +[rank0]: out = method(*_args, **_kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +[rank0]: return func(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step +[rank0]: blip2_loss = self.blip2qformer(batch) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl +[rank0]: return self._call_impl(*args, **kwargs) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl +[rank0]: return forward_call(*args, **kwargs) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward +[rank0]: sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True) +[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global +[rank0]: loss_graph = F.cross_entropy(logits_per_graph, labels) +[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy +[rank0]: return torch._C._nn.cross_entropy_loss( +[rank0]: RuntimeError: size mismatch (got input: [4], target: [1]) diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..92bc26e2f236896f569edd37a24ef914a02f96de --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt @@ -0,0 +1,225 @@ +numpy==2.2.6 +confection==0.1.5 +text-unidecode==1.3 +contexttimer==0.3.3 +omegaconf==2.3.0 +tzdata==2025.2 +nvidia-cuda-nvrtc-cu12==12.4.127 +plotly==6.1.1 +decord==0.6.0 +nvidia-cublas-cu12==12.4.5.8 +scipy==1.15.3 +nvidia-cufile-cu12==1.11.1.6 +parso==0.8.4 +python-dateutil==2.9.0.post0 +setuptools==78.1.1 +aiosignal==1.3.2 +joblib==1.5.1 +platformdirs==4.3.8 +regex==2024.11.6 +aiohappyeyeballs==2.6.1 +virtualenv==20.31.2 +lazy_loader==0.4 +rich==14.0.0 +timm==0.4.12 +antlr4-python3-runtime==4.9.3 +pandas==2.2.3 +salesforce-lavis==1.0.2 +gitdb==4.0.12 +six==1.17.0 +smmap==5.0.2 +annotated-types==0.7.0 +pyparsing==3.2.3 +Jinja2==3.1.6 +ptyprocess==0.7.0 +streamlit==1.45.1 +idna==3.10 +nvidia-cusolver-cu12==11.6.1.9 +tenacity==9.1.2 +sentencepiece==0.2.0 +matplotlib-inline==0.1.7 +typing-inspection==0.4.1 +packaging==24.2 +nltk==3.9.1 +wheel==0.45.1 +catalogue==2.0.10 +matplotlib==3.10.3 +propcache==0.3.1 +Pygments==2.19.1 +nvidia-nvjitlink-cu12==12.4.127 +requests==2.32.3 +filelock==3.18.0 +pexpect==4.9.0 +opencv-python-headless==4.5.5.64 +certifi==2025.4.26 
+nvidia-nvtx-cu12==12.4.127 +bleach==6.2.0 +typing_extensions==4.13.2 +tornado==6.5.1 +networkx==3.4.2 +sympy==1.13.1 +watchdog==6.0.0 +kaggle==1.7.4.5 +nvidia-ml-py==12.575.51 +pyarrow==20.0.0 +mpmath==1.3.0 +lightning-utilities==0.14.3 +ftfy==6.3.1 +triton==3.2.0 +referencing==0.36.2 +ipython==8.36.0 +yarl==1.20.0 +language_data==1.3.0 +cycler==0.12.1 +python-magic==0.4.27 +wasabi==1.1.3 +protobuf==6.31.0 +murmurhash==1.0.13 +jsonschema-specifications==2025.4.1 +blinker==1.9.0 +fonttools==4.58.0 +imageio==2.37.0 +pycocoevalcap==1.2 +nvidia-cuda-cupti-cu12==12.4.127 +fairscale==0.4.4 +hjson==3.1.0 +identify==2.6.12 +mdurl==0.1.2 +decorator==5.2.1 +distlib==0.3.9 +webencodings==0.5.1 +kiwisolver==1.4.8 +srsly==2.5.1 +frozenlist==1.6.0 +blis==1.3.0 +contourpy==1.3.2 +hf-xet==1.1.2 +cymem==2.0.11 +pillow==11.2.1 +pycocotools==2.0.8 +pre_commit==4.2.0 +wrapt==1.17.2 +nvidia-curand-cu12==10.3.5.147 +spacy==3.8.7 +rpds-py==0.25.1 +exceptiongroup==1.3.0 +braceexpand==0.1.7 +rouge_score==0.1.2 +async-timeout==5.0.1 +torchmetrics==1.7.1 +nvidia-nccl-cu12==2.21.5 +wcwidth==0.2.13 +nvidia-cusparselt-cu12==0.6.2 +scikit-image==0.25.2 +urllib3==2.4.0 +portalocker==3.1.1 +smart-open==7.1.0 +cfgv==3.4.0 +markdown-it-py==3.0.0 +charset-normalizer==3.4.2 +executing==2.2.0 +pure_eval==0.2.3 +safetensors==0.5.3 +spacy-legacy==3.0.12 +shellingham==1.5.4 +langcodes==3.5.0 +pytz==2025.2 +iopath==0.1.10 +weasel==0.4.1 +tifffile==2025.5.10 +nodeenv==1.9.1 +absl-py==2.2.2 +einops==0.8.1 +msgpack==1.1.0 +pydantic_core==2.33.2 +ninja==1.11.1.4 +altair==5.5.0 +attrs==25.3.0 +tqdm==4.67.1 +deepspeed==0.16.10+b666844f +pydeck==0.9.1 +stack-data==0.6.3 +pydantic==2.11.5 +torch==2.6.0 +nvidia-cudnn-cu12==9.1.0.70 +python-slugify==8.0.4 +webdataset==0.2.111 +pytorch-lightning==2.5.1.post0 +prompt_toolkit==3.0.51 +psutil==7.0.0 +opendatasets==0.1.22 +asttokens==3.0.0 +MarkupSafe==3.0.2 +multidict==6.4.4 +nvidia-cufft-cu12==11.2.1.3 +GitPython==3.1.44 +PyYAML==6.0.2 +cloudpathlib==0.21.1 +toml==0.10.2 +marisa-trie==1.2.1 +traitlets==5.14.3 +cachetools==5.5.2 +spacy-loggers==1.0.5 +nvidia-cuda-runtime-cu12==12.4.127 +torchvision==0.21.0 +nvidia-cusparse-cu12==12.3.1.170 +jedi==0.19.2 +thinc==8.3.6 +py-cpuinfo==9.0.0 +yacs==0.1.8 +cffi==1.17.1 +preshed==3.0.10 +more-itertools==10.7.0 +bigmodelvis==0.0.1 +datasets==3.6.0 +huggingface-hub==0.32.1 +narwhals==1.41.0 +xxhash==3.5.0 +sentry-sdk==2.29.1 +aiohttp==3.12.2 +opencv-python==4.11.0.86 +pycryptodome==3.23.0 +threadpoolctl==3.6.0 +flash-attn==2.7.1.post1 +transformers==4.52.3 +pycparser==2.22 +pathlib==1.0.1 +dill==0.3.8 +scikit-learn==1.6.1 +tokenizers==0.21.1 +aliyun-python-sdk-core==2.16.0 +fsspec==2025.3.0 +jmespath==0.10.0 +click==8.2.1 +delta-center-client==0.0.4 +cheroot==10.0.1 +wandb==0.19.11 +setproctitle==1.3.6 +jsonschema==4.24.0 +oss2==2.15.0 +multiprocess==0.70.16 +jaraco.functools==4.1.0 +web.py==0.62 +aliyun-python-sdk-kms==2.16.5 +cryptography==45.0.3 +pip==25.1.1 +docker-pycreds==0.4.0 +typer==0.16.0 +opendelta==0.3.2 +crcmod==1.7 +jaraco.functools==4.0.1 +inflect==7.3.1 +jaraco.collections==5.1.0 +packaging==24.2 +wheel==0.45.1 +tomli==2.0.1 +platformdirs==4.2.2 +typing_extensions==4.12.2 +more-itertools==10.3.0 +autocommand==2.2.2 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +jaraco.context==5.3.0 +zipp==3.19.2 +backports.tarfile==1.2.0 +typeguard==4.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json 
b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c1b802be4e1e99ac67a89c9aed3d71f14d518dbb --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-22T11:18:08.739768Z", + "args": [ + "--devices", + "0,1,2,3", + "--mode", + "train", + "--filename", + "stage1_06221723", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06221723/", + "host": "dsw-251511-c5cfcb8-lwcpt", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1363197952" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..9f61df7d54e464e8f8f30d59b5674f8ef6e7a4a9 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":11}} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..490615571c9952f311854e426efd6b5359c8d307 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2025-06-22T19:18:08.742601474+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink 
path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-core.log"} +{"time":"2025-06-22T19:18:09.773061599+08:00","level":"INFO","msg":"created new stream","id":"a2dszq0q"} +{"time":"2025-06-22T19:18:09.773105546+08:00","level":"INFO","msg":"stream: started","id":"a2dszq0q"} +{"time":"2025-06-22T19:18:09.773141968+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"a2dszq0q"} +{"time":"2025-06-22T19:18:09.773166056+08:00","level":"INFO","msg":"sender: started","stream_id":"a2dszq0q"} +{"time":"2025-06-22T19:18:09.773225667+08:00","level":"INFO","msg":"handler: started","stream_id":"a2dszq0q"} +{"time":"2025-06-22T19:18:11.012086945+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-22T19:18:19.860220439+08:00","level":"INFO","msg":"stream: closing","id":"a2dszq0q"} +{"time":"2025-06-22T19:18:19.860330929+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-06-22T19:18:19.861063374+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..925397fe4d9d131a527b9b3b864f3098115fca5a --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log @@ -0,0 +1,24 @@ +2025-06-22 19:18:08,732 INFO MainThread:99755 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Configure stats pid to 99755 +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():852] calling init triggers +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():893] starting backend +2025-06-22 19:18:08,733 INFO MainThread:99755 [wandb_init.py:init():897] sending inform_init request +2025-06-22 19:18:08,734 INFO MainThread:99755 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-22 19:18:08,736 INFO MainThread:99755 [wandb_init.py:init():907] backend started and connected +2025-06-22 19:18:08,740 INFO MainThread:99755 [wandb_init.py:init():1005] updated telemetry +2025-06-22 19:18:08,746 INFO MainThread:99755 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-22 19:18:10,963 INFO MainThread:99755 [wandb_init.py:init():1104] starting run threads in backend +2025-06-22 19:18:11,141 INFO MainThread:99755 
[wandb_run.py:_console_start():2573] atexit reg +2025-06-22 19:18:11,142 INFO MainThread:99755 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-22 19:18:11,145 INFO MainThread:99755 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-22 19:18:11,146 INFO MainThread:99755 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-22 19:18:14,870 INFO MainThread:99755 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} +2025-06-22 19:18:19,859 INFO MsgRouterThr:99755 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles. diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e012eba04d3a36cee18cf21032bfe29a149858cd --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml @@ -0,0 +1,128 @@ +_wandb: + value: + cli_version: 0.19.11 + m: + - "1": trainer/global_step + "6": + - 3 + "7": [] + python_version: 3.10.0 + t: + "1": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "2": + - 1 + - 5 + - 9 + - 11 + - 33 + - 41 + - 49 + - 53 + - 55 + - 63 + - 103 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.0 + "5": 0.19.11 + "6": 4.52.3 + "8": + - 5 + "12": 0.19.11 + "13": linux-x86_64 +accelerator: + value: gpu +batch_size: + value: 32 +bert_hidden_dim: + value: 768 +bert_name: + value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft +check_val_every_n_epoch: + value: 1 +cross_attention_freq: + value: 2 +devices: + value: 0,1,2,3,4,5,6,7 +filename: + value: stage1_06221723 +init_checkpoint: + value: "" +init_lr: + value: 0.0001 +lm: + value: true +load_4bit: + value: false +lr_decay_rate: + value: 0.9 +match_batch_size: + value: 64 +max_epochs: + value: 20 +min_lr: + value: 1e-05 +mix_dataset: + value: true +mode: + value: train +num_query_token: + value: 8 +num_workers: + value: 8 +plm_name: + value: 
/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m +plm_tune: + value: freeze +pool_size: + value: 0 +precision: + value: bf16-mixed +projection_dim: + value: 256 +prot_aug: + value: None +prot_max_len: + value: 1024 +ptm: + value: true +rerank_cand_num: + value: 128 +retrieval_eval_epoch: + value: 10 +root: + value: data +save_every_n_epochs: + value: 5 +scheduler: + value: linear_warmup_cosine_lr +seed: + value: 42 +strategy: + value: deepspeed +temperature: + value: 0.1 +text_max_len: + value: 128 +use_wandb_logger: + value: true +warmup_lr: + value: 1e-06 +warmup_steps: + value: 1000 +weight_decay: + value: 0.05 diff --git a/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..06b2bacc5ef44e05661f4448af56f99ba10bd7f2 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log @@ -0,0 +1,110 @@ +[rank0]:W0622 19:26:47.840041 104393 site-packages/torch/distributed/distributed_c10d.py:2941] _object_to_tensor size: 81 hash value: 6444836214324640892 +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Sanity Checking DataLoader 0: 0%| | 0/1 [00:00<?, ?it/s] +{"time":"2025-06-25T21:19:05.323923963+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58536->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T21:27:33.331148254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.0.83:54150->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T21:58:27.179919372+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40534->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:02:25.259899391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38440->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:16:31.59587148+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:56268->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:19:56.907879782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43654->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:23:52.42791399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:53794->104.21.20.172:443: read: connection 
timed out"} +{"time":"2025-06-25T22:26:33.708117916+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40034->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:30:13.867916693+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:32980->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:30:33.10966556+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T22:31:18.580341641+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:39024->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T22:31:45.75581378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34464->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T22:34:33.112611993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-25T22:34:35.287138561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:42:10.155856104+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58140->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:45:09.355938555+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52284->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:46:07.40951519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:51012->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T22:46:41.008497375+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:47:31.259847092+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43332->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T22:51:19.019873559+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57658->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:52:03.381938447+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:52:23.522879435+08:00","level":"INFO","msg":"api: retrying error","error":"Post 
\"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": EOF"} +{"time":"2025-06-25T22:53:48.1218961+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T22:54:20.402583005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-25T22:55:17.099855676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:50348->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:00:16.619877289+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:37840->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:03:18.81326346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T23:03:57.745139654+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:36672->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T23:08:14.315883208+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:54016->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:13:09.739942718+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:54158->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:16:20.715862184+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58004->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:20:39.78889843+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40198->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:23:30.795892852+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:48128->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:26:15.910097937+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:44958->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:30:50.604847693+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:42698->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:32:23.800889762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected 
EOF"} +{"time":"2025-06-25T23:33:26.830944473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T23:38:21.163951837+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52674->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:41:24.459894526+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38954->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:44:19.563893608+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34598->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:46:18.182150858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34706->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:49:46.249081011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": context deadline exceeded"} +{"time":"2025-06-25T23:53:18.699900687+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57666->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:53:46.542264524+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:33188->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:54:07.951959346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T23:57:13.707890889+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:36368->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:59:57.036853786+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52244->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:03:11.083910384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52824->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:10:34.743438132+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-26T00:13:02.606968144+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58564->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-26T00:18:10.155893302+08:00","level":"INFO","msg":"api: retrying 
error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:46202->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:22:20.523889616+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:33012->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-26T00:38:02.395930463+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:55212->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-26T00:53:38.53989026+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58500->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-26T00:55:03.187417989+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-26T00:57:10.507870306+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57040->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T01:07:32.913163174+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38334->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-26T01:17:38.796889621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43274->104.21.20.172:443: read: connection timed out"} diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/debug.log b/ProtT3/all_checkpoints/stage1_06251842/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5a931ca69b74a5161821cd575a8a84eb8692935f --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/debug.log @@ -0,0 +1,23 @@ +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Configure stats pid to 4055 +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug.log +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-internal.log +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():852] calling init triggers +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-25 
18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():893] starting backend +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():897] sending inform_init request +2025-06-25 18:45:45,317 INFO MainThread:4055 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-25 18:45:45,331 INFO MainThread:4055 [wandb_init.py:init():907] backend started and connected +2025-06-25 18:45:45,334 INFO MainThread:4055 [wandb_init.py:init():1005] updated telemetry +2025-06-25 18:45:45,339 INFO MainThread:4055 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-25 18:45:47,641 INFO MainThread:4055 [wandb_init.py:init():1104] starting run threads in backend +2025-06-25 18:45:47,902 INFO MainThread:4055 [wandb_run.py:_console_start():2573] atexit reg +2025-06-25 18:45:47,903 INFO MainThread:4055 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-25 18:45:47,914 INFO MainThread:4055 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-25 18:45:47,914 INFO MainThread:4055 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-25 18:45:47,921 INFO MainThread:4055 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-25 18:45:53,732 INFO MainThread:4055 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06251842', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'} diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/output.log b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9a4c6003ddeba6bf789394b7c9ad2f29e47e9633 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/output.log @@ -0,0 +1,28 @@ +/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06251842 exists and is not empty. +Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`. 
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params | Mode +------------------------------------------------------ +0 | blip2qformer | Blip2Qformer | 327 M | train +------------------------------------------------------ +179 M Trainable params +147 M Non-trainable params +327 M Total params +1,309.467 Total estimated model params size (MB) +5 Modules in train mode +926 Modules in eval mode +Epoch 9: 100%|███████████████████████████████████████████| 3331/3331 [35:13<00:00, 1.58it/s, v_num=3n36] +/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32): +Validation DataLoader 2: 100%|███████████████████████████████████████████| 40/40 [00:19<00:00, 2.08it/s] +/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. + sd = self.module.state_dict(destination, prefix, keep_vars) +/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py:42: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + return torch.cuda.amp.autocast(dtype=dtype) +100%|██████████████████████████████████████████████████████████████████| 157/157 [08:53<00:00, 3.40s/it] +100%|██████████████████████████████████████████████████████████████| 1250/1250 [00:01<00:00, 1090.12it/s] +re-ranking p2t: 100%|████████████████████████████████████████████████| 2500/2500 [10:06<00:00, 4.12it/s] + 97%|████████████████████████████████████████████████████████████▎ | 1216/1250 [00:01<00:00, 1109.38it/s][rank: 1] Child process with PID 4522 terminated with code -6. 
Forcefully terminating all other processes to avoid zombies 🧟 +re-ranking p2t: 100%|████████████████████████████████████████████████| 2500/2500 [10:06<00:00, 4.11it/s] +re-ranking t2p: 57%|███████████████████████████▌ | 1436/2500 [10:57<08:10, 2.17it/s] diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/requirements.txt b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3381a8445e39a0567cdc98ae36e823ff5bafeba --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/requirements.txt @@ -0,0 +1,225 @@ +decord==0.6.0 +pandas==2.2.3 +py-cpuinfo==9.0.0 +virtualenv==20.31.2 +omegaconf==2.3.0 +mdurl==0.1.2 +packaging==24.2 +toml==0.10.2 +hf-xet==1.1.2 +cfgv==3.4.0 +braceexpand==0.1.7 +stack-data==0.6.3 +jedi==0.19.2 +exceptiongroup==1.3.0 +requests==2.32.3 +wcwidth==0.2.13 +lazy_loader==0.4 +blinker==1.9.0 +triton==3.2.0 +annotated-types==0.7.0 +bleach==6.2.0 +urllib3==2.4.0 +spacy-legacy==3.0.12 +fairscale==0.4.4 +absl-py==2.2.2 +python-dateutil==2.9.0.post0 +safetensors==0.5.3 +pyparsing==3.2.3 +confection==0.1.5 +typing-inspection==0.4.1 +jsonschema-specifications==2025.4.1 +Jinja2==3.1.6 +psutil==7.0.0 +numpy==2.2.6 +pytz==2025.2 +MarkupSafe==3.0.2 +wasabi==1.1.3 +imageio==2.37.0 +scipy==1.15.3 +text-unidecode==1.3 +filelock==3.18.0 +webdataset==0.2.111 +pydantic==2.11.5 +altair==5.5.0 +rouge_score==0.1.2 +sympy==1.13.1 +nvidia-nvtx-cu12==12.4.127 +asttokens==3.0.0 +prompt_toolkit==3.0.51 +attrs==25.3.0 +spacy==3.8.7 +pycocotools==2.0.8 +smmap==5.0.2 +PyYAML==6.0.2 +tenacity==9.1.2 +nvidia-cufile-cu12==1.11.1.6 +torch==2.6.0 +idna==3.10 +blis==1.3.0 +typing_extensions==4.13.2 +distlib==0.3.9 +pre_commit==4.2.0 +nvidia-cusolver-cu12==11.6.1.9 +timm==0.4.12 +frozenlist==1.6.0 +gitdb==4.0.12 +shellingham==1.5.4 +murmurhash==1.0.13 +hjson==3.1.0 +kiwisolver==1.4.8 +platformdirs==4.3.8 +networkx==3.4.2 +ipython==8.36.0 +marisa-trie==1.2.1 +weasel==0.4.1 +multidict==6.4.4 +setuptools==78.1.1 +thinc==8.3.6 +cachetools==5.5.2 +rich==14.0.0 +salesforce-lavis==1.0.2 +joblib==1.5.1 +nodeenv==1.9.1 +ptyprocess==0.7.0 +wrapt==1.17.2 +tzdata==2025.2 +Pygments==2.19.1 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-nccl-cu12==2.21.5 +antlr4-python3-runtime==4.9.3 +markdown-it-py==3.0.0 +nltk==3.9.1 +sentencepiece==0.2.0 +aiohappyeyeballs==2.6.1 +python-slugify==8.0.4 +tqdm==4.67.1 +streamlit==1.45.1 +pytorch-lightning==2.5.1.post0 +cycler==0.12.1 +kaggle==1.7.4.5 +pexpect==4.9.0 +cymem==2.0.11 +rpds-py==0.25.1 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +language_data==1.3.0 +portalocker==3.1.1 +propcache==0.3.1 +regex==2024.11.6 +tifffile==2025.5.10 +aiosignal==1.3.2 +ninja==1.11.1.4 +parso==0.8.4 +torchmetrics==1.7.1 +async-timeout==5.0.1 +nvidia-cublas-cu12==12.4.5.8 +pyarrow==20.0.0 +plotly==6.1.1 +six==1.17.0 +pydantic_core==2.33.2 +contexttimer==0.3.3 +msgpack==1.1.0 +wheel==0.45.1 +opendatasets==0.1.22 +srsly==2.5.1 +charset-normalizer==3.4.2 +nvidia-cufft-cu12==11.2.1.3 +tornado==6.5.1 +pillow==11.2.1 +traitlets==5.14.3 +nvidia-cuda-cupti-cu12==12.4.127 +python-magic==0.4.27 +ftfy==6.3.1 +opencv-python-headless==4.5.5.64 +scikit-image==0.25.2 +pure_eval==0.2.3 +watchdog==6.0.0 +iopath==0.1.10 +nvidia-ml-py==12.575.51 +executing==2.2.0 +GitPython==3.1.44 +einops==0.8.1 +webencodings==0.5.1 +protobuf==6.31.0 +identify==2.6.12 +certifi==2025.4.26 +smart-open==7.1.0 
+matplotlib-inline==0.1.7 +mpmath==1.3.0 +contourpy==1.3.2 +matplotlib==3.10.3 +deepspeed==0.16.10+b666844f +nvidia-nvjitlink-cu12==12.4.127 +langcodes==3.5.0 +decorator==5.2.1 +spacy-loggers==1.0.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusparse-cu12==12.3.1.170 +fonttools==4.58.0 +pydeck==0.9.1 +torchvision==0.21.0 +referencing==0.36.2 +pycocoevalcap==1.2 +lightning-utilities==0.14.3 +catalogue==2.0.10 +nvidia-cusparselt-cu12==0.6.2 +yarl==1.20.0 +cloudpathlib==0.21.1 +fsspec==2025.3.0 +threadpoolctl==3.6.0 +aiohttp==3.12.2 +cheroot==10.0.1 +cryptography==45.0.3 +more-itertools==10.7.0 +docker-pycreds==0.4.0 +wandb==0.19.11 +aliyun-python-sdk-core==2.16.0 +oss2==2.15.0 +scikit-learn==1.6.1 +yacs==0.1.8 +multiprocess==0.70.16 +transformers==4.52.3 +huggingface-hub==0.32.1 +typer==0.16.0 +opendelta==0.3.2 +jaraco.functools==4.1.0 +web.py==0.62 +dill==0.3.8 +pathlib==1.0.1 +pycparser==2.22 +delta-center-client==0.0.4 +datasets==3.6.0 +setproctitle==1.3.6 +xxhash==3.5.0 +click==8.2.1 +bigmodelvis==0.0.1 +preshed==3.0.10 +crcmod==1.7 +narwhals==1.41.0 +pip==25.1.1 +cffi==1.17.1 +opencv-python==4.11.0.86 +jmespath==0.10.0 +jsonschema==4.24.0 +pycryptodome==3.23.0 +tokenizers==0.21.1 +sentry-sdk==2.29.1 +aliyun-python-sdk-kms==2.16.5 +flash-attn==2.7.1.post1 +inflect==7.3.1 +packaging==24.2 +jaraco.collections==5.1.0 +jaraco.text==3.12.1 +importlib_metadata==8.0.0 +typeguard==4.3.0 +platformdirs==4.2.2 +backports.tarfile==1.2.0 +tomli==2.0.1 +jaraco.context==5.3.0 +zipp==3.19.2 +autocommand==2.2.2 +typing_extensions==4.12.2 +jaraco.functools==4.0.1 +wheel==0.45.1 +more-itertools==10.3.0 diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/wandb-metadata.json b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8ecb16fc231aad6ef5d02b5a9ee0358228300961 --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/files/wandb-metadata.json @@ -0,0 +1,98 @@ +{ + "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35", + "python": "CPython 3.10.0", + "startedAt": "2025-06-25T10:45:45.333606Z", + "args": [ + "--devices", + "0,1,2,3,4,5,6,7", + "--mode", + "train", + "--filename", + "stage1_06251842", + "--num_query_token", + "8", + "--plm_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m", + "--bert_name", + "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft", + "--save_every_n_epochs", + "5", + "--max_epochs", + "20", + "--batch_size", + "32", + "--precision", + "bf16-mixed", + "--mix_dataset", + "--num_workers", + "8", + "--use_wandb_logger" + ], + "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", + "codePath": "stage1.py", + "email": "gia0603yucca@gmail.com", + "root": "./all_checkpoints/stage1_06251842/", + "host": "dsw-265304-856768c7b7-kp5k7", + "executable": "/root/miniconda3/envs/protT3/bin/python", + "codePathLocal": "stage1.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA A800-SXM4-80GB", + "gpu_count": 8, + "disk": { + "/": { + "total": "1623302262784", + "used": "1285709824" + } + }, + "memory": { + "total": "549755813888" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + 
"memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + }, + { + "name": "NVIDIA A800-SXM4-80GB", + "memoryTotal": "85198045184", + "architecture": "Ampere" + } + ], + "cudaVersion": "12.1" +} \ No newline at end of file diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-internal.log b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3b38cb88fa8748998497e973f741467e4e2410ce --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-internal.log @@ -0,0 +1,67 @@ +{"time":"2025-06-25T18:45:45.331877961+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-core.log"} +{"time":"2025-06-25T18:45:46.398547009+08:00","level":"INFO","msg":"created new stream","id":"w3yy3n36"} +{"time":"2025-06-25T18:45:46.39859813+08:00","level":"INFO","msg":"stream: started","id":"w3yy3n36"} +{"time":"2025-06-25T18:45:46.398627059+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"w3yy3n36"} +{"time":"2025-06-25T18:45:46.398651468+08:00","level":"INFO","msg":"sender: started","stream_id":"w3yy3n36"} +{"time":"2025-06-25T18:45:46.398711906+08:00","level":"INFO","msg":"handler: started","stream_id":"w3yy3n36"} +{"time":"2025-06-25T18:45:47.652788913+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-25T21:16:48.067072441+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T21:17:20.327592009+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T21:19:05.323923963+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58536->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T21:27:33.331148254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.0.83:54150->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T21:58:27.179919372+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40534->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:02:25.259899391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38440->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:16:31.59587148+08:00","level":"INFO","msg":"api: retrying error","error":"Post 
\"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:56268->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:19:56.907879782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43654->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:23:52.42791399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:53794->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:26:33.708117916+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40034->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:30:13.867916693+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:32980->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:30:33.10966556+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T22:31:18.580341641+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:39024->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T22:31:45.75581378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34464->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T22:34:33.112611993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-25T22:34:35.287138561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:42:10.155856104+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58140->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:45:09.355938555+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52284->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T22:46:07.40951519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:51012->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T22:46:41.008497375+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:47:31.259847092+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43332->104.21.20.172:443: read: connection 
reset by peer"} +{"time":"2025-06-25T22:51:19.019873559+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57658->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T22:52:03.381938447+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T22:52:23.522879435+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": EOF"} +{"time":"2025-06-25T22:53:48.1218961+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T22:54:20.402583005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"} +{"time":"2025-06-25T22:55:17.099855676+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:50348->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:00:16.619877289+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:37840->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:03:18.81326346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-25T23:03:57.745139654+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:36672->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-25T23:08:14.315883208+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:54016->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:13:09.739942718+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:54158->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:16:20.715862184+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58004->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:20:39.78889843+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:40198->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:23:30.795892852+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:48128->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:26:15.910097937+08:00","level":"INFO","msg":"api: retrying error","error":"Post 
\"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:44958->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:30:50.604847693+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:42698->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:32:23.800889762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T23:33:26.830944473+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T23:38:21.163951837+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52674->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:41:24.459894526+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38954->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-25T23:44:19.563893608+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34598->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:46:18.182150858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:34706->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:49:46.249081011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": context deadline exceeded"} +{"time":"2025-06-25T23:53:18.699900687+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57666->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:53:46.542264524+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:33188->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-25T23:54:07.951959346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-25T23:57:13.707890889+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:36368->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-25T23:59:57.036853786+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:52244->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:03:11.083910384+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 
10.1.0.83:52824->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:10:34.743438132+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": unexpected EOF"} +{"time":"2025-06-26T00:13:02.606968144+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58564->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-26T00:18:10.155893302+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:46202->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T00:22:20.523889616+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:33012->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-26T00:38:02.395930463+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:55212->172.67.193.61:443: read: connection reset by peer"} +{"time":"2025-06-26T00:53:38.53989026+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:58500->104.21.20.172:443: read: connection timed out"} +{"time":"2025-06-26T00:55:03.187417989+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2025-06-26T00:57:10.507870306+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:57040->172.67.193.61:443: read: connection timed out"} +{"time":"2025-06-26T01:07:32.913163174+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:38334->104.21.20.172:443: read: connection reset by peer"} +{"time":"2025-06-26T01:17:38.796889621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06251842/w3yy3n36/file_stream\": read tcp 10.1.0.83:43274->104.21.20.172:443: read: connection timed out"} diff --git a/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug.log b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5a931ca69b74a5161821cd575a8a84eb8692935f --- /dev/null +++ b/ProtT3/all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug.log @@ -0,0 +1,23 @@ +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11 +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Configure stats pid to 4055 +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings +2025-06-25 18:45:45,314 INFO MainThread:4055 
[wandb_setup.py:_flush():70] Loading settings from environment variables +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug.log +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06251842/wandb/run-20250625_184545-w3yy3n36/logs/debug-internal.log +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():852] calling init triggers +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():893] starting backend +2025-06-25 18:45:45,314 INFO MainThread:4055 [wandb_init.py:init():897] sending inform_init request +2025-06-25 18:45:45,317 INFO MainThread:4055 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-25 18:45:45,331 INFO MainThread:4055 [wandb_init.py:init():907] backend started and connected +2025-06-25 18:45:45,334 INFO MainThread:4055 [wandb_init.py:init():1005] updated telemetry +2025-06-25 18:45:45,339 INFO MainThread:4055 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout +2025-06-25 18:45:47,641 INFO MainThread:4055 [wandb_init.py:init():1104] starting run threads in backend +2025-06-25 18:45:47,902 INFO MainThread:4055 [wandb_run.py:_console_start():2573] atexit reg +2025-06-25 18:45:47,903 INFO MainThread:4055 [wandb_run.py:_redirect():2421] redirect: wrap_raw +2025-06-25 18:45:47,914 INFO MainThread:4055 [wandb_run.py:_redirect():2490] Wrapping output streams. +2025-06-25 18:45:47,914 INFO MainThread:4055 [wandb_run.py:_redirect():2513] Redirects installed. +2025-06-25 18:45:47,921 INFO MainThread:4055 [wandb_init.py:init():1150] run started, returning control to user process +2025-06-25 18:45:53,732 INFO MainThread:4055 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06251842', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
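The config_cb entry above names the schedule 'linear_warmup_cosine_lr' with 'warmup_lr' 1e-06, 'init_lr' 0.0001, 'min_lr' 1e-05, and 'warmup_steps' 1000. The ProtT3 scheduler implementation itself is not part of this diff, so the following is only a sketch of what a schedule with that name conventionally computes from those logged values; the function name and the 'total_steps' parameter are hypothetical.

```python
import math

def linear_warmup_cosine_lr(step: int, total_steps: int,
                            warmup_steps: int = 1000,
                            warmup_lr: float = 1e-6,
                            init_lr: float = 1e-4,
                            min_lr: float = 1e-5) -> float:
    """Hypothetical sketch: linear warmup to init_lr, then cosine decay to min_lr.

    Defaults mirror the values recorded in the config_cb entry above.
    """
    if step < warmup_steps:
        # Linearly interpolate from warmup_lr up to init_lr during warmup.
        return warmup_lr + (init_lr - warmup_lr) * step / warmup_steps
    # Cosine-anneal from init_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (init_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```

With these defaults the sketch returns 1e-6 at step 0, exactly 1e-4 at step 1000 (end of warmup), and decays to 1e-5 by the final step, matching the warmup_lr / init_lr / min_lr triple logged above.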
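The long run of retry errors collapsed earlier in this diff shows the run spending roughly three hours retrying against an unreachable tracking backend. As a hedged workaround sketch, not something these logs show being done, W&B can log offline and sync afterwards; WANDB_MODE=offline and the 'wandb sync' CLI command are standard W&B features, and the project name below is taken from the run path (gia0603yucca/stage1_06251842) in the URLs above.

```python
import os

# Buffer logs locally instead of streaming to the (here unreachable) backend.
# Must be set before wandb.init is called.
os.environ["WANDB_MODE"] = "offline"

import wandb

run = wandb.init(project="stage1_06251842")  # project name borrowed from the log paths above
run.log({"train/loss": 0.0})                 # placeholder metric, for illustration only
run.finish()

# Later, from a host with working connectivity, upload the buffered run:
#   wandb sync ./wandb/run-20250625_184545-w3yy3n36
```

Running offline avoids the hours of blocked file_stream retries seen above at the cost of losing live dashboards until the run directory is synced.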