yuccaaa commited on Sep 3, 2025

Commit

992a397

verified ·

1 Parent(s): e48e67e

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png +0 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png +0 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml +128 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log +141 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json +1 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log +10 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log +24 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb +0 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml +128 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log +141 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json +1 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log +15 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log +24 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb +0 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml +236 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log +20 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json +1 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log +10 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log +24 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml +128 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log +141 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json +1 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log +15 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log +24 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb +0 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml +128 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log +150 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json +1 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log +10 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log +24 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb +0 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml +128 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log +110 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json +98 -0
ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb +0 -0

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/eval_token_acc.png ADDED Viewed

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_epoch.png ADDED Viewed

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_grad_norm.png ADDED Viewed

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_learning_rate.png ADDED Viewed

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_loss.png ADDED Viewed

BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_memory(GiB).png ADDED Viewed

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/config.yaml ADDED Viewed

	@@ -0,0 +1,128 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 8
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/output.log ADDED Viewed

	@@ -0,0 +1,141 @@

+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Sanity Checking DataLoader 2:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.86it/s]
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Traceback (most recent call last):
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+    main(args)
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+    trainer.fit(model, datamodule=dm)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+    call._call_and_handle_interrupt(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+    return function(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+    self._run(model, ckpt_path=ckpt_path)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+    results = self._run_stage()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+    self._run_sanity_check()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+    val_loop.run()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+    return loop_run(self, *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+    self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+    output = call._call_strategy_hook(trainer, hook_name, *step_args)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+    return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+    wrapper_output = wrapper_module(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+    ret_val = func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+    loss = self.module(*inputs, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+    return inner()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+    result = forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+    out = method(*_args, **_kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+    blip2_loss = self.blip2qformer(batch)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+    sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+    loss_graph = F.cross_entropy(logits_per_graph, labels)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(
+RuntimeError: size mismatch (got input: [4], target: [1])
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+[rank0]:     trainer.fit(model, datamodule=dm)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+[rank0]:     call._call_and_handle_interrupt(
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+[rank0]:     return function(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+[rank0]:     self._run(model, ckpt_path=ckpt_path)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+[rank0]:     results = self._run_stage()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+[rank0]:     self._run_sanity_check()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+[rank0]:     val_loop.run()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+[rank0]:     return loop_run(self, *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+[rank0]:     self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+[rank0]:     output = call._call_strategy_hook(trainer, hook_name, *step_args)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+[rank0]:     output = fn(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+[rank0]:     return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+[rank0]:     wrapper_output = wrapper_module(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+[rank0]:     out = method(*_args, **_kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+[rank0]:     return func(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+[rank0]:     blip2_loss = self.blip2qformer(batch)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+[rank0]:     sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+[rank0]:     loss_graph = F.cross_entropy(logits_per_graph, labels)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+[rank0]:     return torch._C._nn.cross_entropy_loss(
+[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:01:09.864619Z",
+  "args": [
+    "--devices",
+    "0,1,2,3",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363144704"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb":{"runtime":11}}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,10 @@

+{"time":"2025-06-22T19:01:09.866022252+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-core.log"}
+{"time":"2025-06-22T19:01:10.922793248+08:00","level":"INFO","msg":"created new stream","id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:10.922840238+08:00","level":"INFO","msg":"stream: started","id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:10.922861712+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:10.922902903+08:00","level":"INFO","msg":"sender: started","stream_id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:10.922946705+08:00","level":"INFO","msg":"handler: started","stream_id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:12.123540259+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-22T19:01:21.097810278+08:00","level":"INFO","msg":"stream: closing","id":"tul2l6xd"}
+{"time":"2025-06-22T19:01:21.097899274+08:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-06-22T19:01:21.098725356+08:00","level":"INFO","msg":"Stopped system monitor"}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log ADDED Viewed

	@@ -0,0 +1,24 @@

+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_setup.py:_flush():70] Configure stats pid to 75754
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug.log
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/logs/debug-internal.log
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:init():852] calling init triggers
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:init():893] starting backend
+2025-06-22 19:01:09,858 INFO    MainThread:75754 [wandb_init.py:init():897] sending inform_init request
+2025-06-22 19:01:09,860 INFO    MainThread:75754 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-22 19:01:09,861 INFO    MainThread:75754 [wandb_init.py:init():907] backend started and connected
+2025-06-22 19:01:09,865 INFO    MainThread:75754 [wandb_init.py:init():1005] updated telemetry
+2025-06-22 19:01:09,869 INFO    MainThread:75754 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-22 19:01:12,112 INFO    MainThread:75754 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-22 19:01:12,253 INFO    MainThread:75754 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-22 19:01:12,253 INFO    MainThread:75754 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-22 19:01:12,256 INFO    MainThread:75754 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-22 19:01:12,256 INFO    MainThread:75754 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-22 19:01:12,257 INFO    MainThread:75754 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-22 19:01:16,111 INFO    MainThread:75754 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
+2025-06-22 19:01:21,097 INFO    MsgRouterThr:75754 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190109-tul2l6xd/run-tul2l6xd.wandb ADDED Viewed

File without changes

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/config.yaml ADDED Viewed

	@@ -0,0 +1,128 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 4
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/output.log ADDED Viewed

	@@ -0,0 +1,141 @@

+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Sanity Checking DataLoader 2:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.92it/s]
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Traceback (most recent call last):
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+    main(args)
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+    trainer.fit(model, datamodule=dm)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+    call._call_and_handle_interrupt(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+    return function(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+    self._run(model, ckpt_path=ckpt_path)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+    results = self._run_stage()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+    self._run_sanity_check()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+    val_loop.run()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+    return loop_run(self, *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+    self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+    output = call._call_strategy_hook(trainer, hook_name, *step_args)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+    return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+    wrapper_output = wrapper_module(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+    ret_val = func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+    loss = self.module(*inputs, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+    return inner()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+    result = forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+    out = method(*_args, **_kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+    blip2_loss = self.blip2qformer(batch)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+    sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+    loss_graph = F.cross_entropy(logits_per_graph, labels)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(
+RuntimeError: size mismatch (got input: [4], target: [1])
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+[rank0]:     trainer.fit(model, datamodule=dm)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+[rank0]:     call._call_and_handle_interrupt(
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+[rank0]:     return function(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+[rank0]:     self._run(model, ckpt_path=ckpt_path)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+[rank0]:     results = self._run_stage()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+[rank0]:     self._run_sanity_check()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+[rank0]:     val_loop.run()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+[rank0]:     return loop_run(self, *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+[rank0]:     self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+[rank0]:     output = call._call_strategy_hook(trainer, hook_name, *step_args)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+[rank0]:     output = fn(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+[rank0]:     return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+[rank0]:     wrapper_output = wrapper_module(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+[rank0]:     out = method(*_args, **_kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+[rank0]:     return func(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+[rank0]:     blip2_loss = self.blip2qformer(batch)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+[rank0]:     sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+[rank0]:     loss_graph = F.cross_entropy(logits_per_graph, labels)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+[rank0]:     return torch._C._nn.cross_entropy_loss(
+[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:04:29.455254Z",
+  "args": [
+    "--devices",
+    "0,1,2,3",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "4",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363148800"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb":{"runtime":11}}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,15 @@

+{"time":"2025-06-22T19:04:29.457094442+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-core.log"}
+{"time":"2025-06-22T19:04:30.490221799+08:00","level":"INFO","msg":"created new stream","id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:30.490257318+08:00","level":"INFO","msg":"stream: started","id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:30.490294134+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:30.490304877+08:00","level":"INFO","msg":"sender: started","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:30.490450091+08:00","level":"INFO","msg":"handler: started","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:31.742014982+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-22T19:04:40.504265321+08:00","level":"INFO","msg":"stream: closing","id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:40.504307897+08:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-06-22T19:04:40.505067489+08:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-06-22T19:04:41.953923124+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-06-22T19:04:43.137437891+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:43.137489667+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:43.137501311+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq9amgfj"}
+{"time":"2025-06-22T19:04:43.141402359+08:00","level":"INFO","msg":"stream: closed","id":"bq9amgfj"}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log ADDED Viewed

	@@ -0,0 +1,24 @@

+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_setup.py:_flush():70] Configure stats pid to 79876
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug.log
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/logs/debug-internal.log
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:init():852] calling init triggers
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:init():893] starting backend
+2025-06-22 19:04:29,449 INFO    MainThread:79876 [wandb_init.py:init():897] sending inform_init request
+2025-06-22 19:04:29,450 INFO    MainThread:79876 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-22 19:04:29,452 INFO    MainThread:79876 [wandb_init.py:init():907] backend started and connected
+2025-06-22 19:04:29,456 INFO    MainThread:79876 [wandb_init.py:init():1005] updated telemetry
+2025-06-22 19:04:29,458 INFO    MainThread:79876 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-22 19:04:31,693 INFO    MainThread:79876 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-22 19:04:31,896 INFO    MainThread:79876 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-22 19:04:31,897 INFO    MainThread:79876 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-22 19:04:31,899 INFO    MainThread:79876 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-22 19:04:31,899 INFO    MainThread:79876 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-22 19:04:31,900 INFO    MainThread:79876 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-22 19:04:35,461 INFO    MainThread:79876 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 4, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
+2025-06-22 19:04:40,503 INFO    MsgRouterThr:79876 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190429-bq9amgfj/run-bq9amgfj.wandb ADDED Viewed

Binary file (20.6 kB). View file

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/config.yaml ADDED Viewed

	@@ -0,0 +1,236 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": loader1/val_loss/dataloader_idx_1
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+            - "1": loader1/val_loss_ptm/dataloader_idx_1
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train_loss
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader2/val_loss_ptm/dataloader_idx_2
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader2/val_loss/dataloader_idx_2
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": epoch
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train_loss_ptm
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train_loss_lm
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader1/val_loss_ptc/dataloader_idx_1
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader0/val_loss_ptm/dataloader_idx_0
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader0/val_loss_lm/dataloader_idx_0
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader2/val_loss_lm/dataloader_idx_2
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader1/val_loss_lm/dataloader_idx_1
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train_loss_ptc
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader0/val_loss_ptc/dataloader_idx_0
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader0/val_loss/dataloader_idx_0
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": lr
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": loader2/val_loss_ptc/dataloader_idx_2
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3,4,5,6,7
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 8
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/output.log ADDED Viewed

	@@ -0,0 +1,20 @@

+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Epoch 3:  18%|██████████▊                                                 | 11/61 [00:08<00:37,  1.35it/s, v_num=24k3]
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Detected KeyboardInterrupt, attempting graceful shutdown ...

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:06:31.087290Z",
+  "args": [
+    "--devices",
+    "0,1,2,3,4,5,6,7",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363152896"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"trainer/global_step":182,"loader2/val_loss_lm/dataloader_idx_2":4.984375,"loader0/val_loss_ptc/dataloader_idx_0":2.392578125,"loader2/val_loss_ptc/dataloader_idx_2":3.49609375,"loader0/val_loss/dataloader_idx_0":7.27734375,"loader0/val_loss_lm/dataloader_idx_0":4.2421875,"train_loss":6.375,"train_loss_ptm":0.63671875,"train_loss_lm":2.783203125,"loader1/val_loss/dataloader_idx_1":5.7578125,"_step":5,"_runtime":135.905828304,"_wandb":{"runtime":145},"lr":9.779754327610135e-05,"epoch":2,"loader1/val_loss_ptm/dataloader_idx_1":0.63330078125,"loader1/val_loss_ptc/dataloader_idx_1":2.71484375,"loader0/val_loss_ptm/dataloader_idx_0":0.6416015625,"loader2/val_loss_ptm/dataloader_idx_2":0.63525390625,"loader1/val_loss_lm/dataloader_idx_1":2.412109375,"loader2/val_loss/dataloader_idx_2":9.109375,"train_loss_ptc":2.953125,"_timestamp":1.7505905269926882e+09}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,10 @@

+{"time":"2025-06-22T19:06:31.147471317+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-core.log"}
+{"time":"2025-06-22T19:06:32.175593884+08:00","level":"INFO","msg":"created new stream","id":"9wqt24k3"}
+{"time":"2025-06-22T19:06:32.175639093+08:00","level":"INFO","msg":"stream: started","id":"9wqt24k3"}
+{"time":"2025-06-22T19:06:32.175689814+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9wqt24k3"}
+{"time":"2025-06-22T19:06:32.17571173+08:00","level":"INFO","msg":"sender: started","stream_id":"9wqt24k3"}
+{"time":"2025-06-22T19:06:32.175747445+08:00","level":"INFO","msg":"handler: started","stream_id":"9wqt24k3"}
+{"time":"2025-06-22T19:06:33.341553042+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-22T19:08:56.485679626+08:00","level":"INFO","msg":"stream: closing","id":"9wqt24k3"}
+{"time":"2025-06-22T19:08:56.485774311+08:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-06-22T19:08:56.486438679+08:00","level":"INFO","msg":"Stopped system monitor"}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log ADDED Viewed

	@@ -0,0 +1,24 @@

+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_setup.py:_flush():70] Configure stats pid to 82552
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug.log
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_190631-9wqt24k3/logs/debug-internal.log
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_init.py:init():852] calling init triggers
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-22 19:06:31,083 INFO    MainThread:82552 [wandb_init.py:init():893] starting backend
+2025-06-22 19:06:31,084 INFO    MainThread:82552 [wandb_init.py:init():897] sending inform_init request
+2025-06-22 19:06:31,085 INFO    MainThread:82552 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-22 19:06:31,087 INFO    MainThread:82552 [wandb_init.py:init():907] backend started and connected
+2025-06-22 19:06:31,088 INFO    MainThread:82552 [wandb_init.py:init():1005] updated telemetry
+2025-06-22 19:06:31,147 INFO    MainThread:82552 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-22 19:06:33,293 INFO    MainThread:82552 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-22 19:06:33,470 INFO    MainThread:82552 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-22 19:06:33,471 INFO    MainThread:82552 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-22 19:06:33,474 INFO    MainThread:82552 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-22 19:06:33,474 INFO    MainThread:82552 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-22 19:06:33,475 INFO    MainThread:82552 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-22 19:06:38,587 INFO    MainThread:82552 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
+2025-06-22 19:08:56,484 INFO    MsgRouterThr:82552 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/config.yaml ADDED Viewed

	@@ -0,0 +1,128 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 8
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/output.log ADDED Viewed

	@@ -0,0 +1,141 @@

+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Sanity Checking DataLoader 2:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.83it/s]
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:219: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Traceback (most recent call last):
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+    main(args)
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+    trainer.fit(model, datamodule=dm)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+    call._call_and_handle_interrupt(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+    return function(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+    self._run(model, ckpt_path=ckpt_path)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+    results = self._run_stage()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+    self._run_sanity_check()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+    val_loop.run()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+    return loop_run(self, *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+    self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+    output = call._call_strategy_hook(trainer, hook_name, *step_args)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+    return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+    wrapper_output = wrapper_module(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+    ret_val = func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+    loss = self.module(*inputs, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+    return inner()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+    result = forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+    out = method(*_args, **_kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+    blip2_loss = self.blip2qformer(batch)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+    sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+    loss_graph = F.cross_entropy(logits_per_graph, labels)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(
+RuntimeError: size mismatch (got input: [4], target: [1])
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+[rank0]:     trainer.fit(model, datamodule=dm)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+[rank0]:     call._call_and_handle_interrupt(
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+[rank0]:     return function(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+[rank0]:     self._run(model, ckpt_path=ckpt_path)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+[rank0]:     results = self._run_stage()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+[rank0]:     self._run_sanity_check()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+[rank0]:     val_loop.run()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+[rank0]:     return loop_run(self, *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+[rank0]:     self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+[rank0]:     output = call._call_strategy_hook(trainer, hook_name, *step_args)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+[rank0]:     output = fn(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+[rank0]:     return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+[rank0]:     wrapper_output = wrapper_module(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+[rank0]:     out = method(*_args, **_kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+[rank0]:     return func(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+[rank0]:     blip2_loss = self.blip2qformer(batch)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 140, in forward
+[rank0]:     sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 103, in contrast_global
+[rank0]:     loss_graph = F.cross_entropy(logits_per_graph, labels)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+[rank0]:     return torch._C._nn.cross_entropy_loss(
+[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:10:38.675049Z",
+  "args": [
+    "--devices",
+    "0,1,2,3",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363165184"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb":{"runtime":11}}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,15 @@

+{"time":"2025-06-22T19:10:38.679318052+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-core.log"}
+{"time":"2025-06-22T19:10:39.726410578+08:00","level":"INFO","msg":"created new stream","id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:39.726458298+08:00","level":"INFO","msg":"stream: started","id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:39.726477847+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:39.726508843+08:00","level":"INFO","msg":"sender: started","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:39.726652161+08:00","level":"INFO","msg":"handler: started","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:40.990067167+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-22T19:10:49.81571789+08:00","level":"INFO","msg":"stream: closing","id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:49.815789673+08:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-06-22T19:10:49.816535239+08:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-06-22T19:10:52.523961836+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-06-22T19:10:53.665534986+08:00","level":"INFO","msg":"handler: closed","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:53.665598663+08:00","level":"INFO","msg":"sender: closed","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:53.665594948+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bu5vqvh5"}
+{"time":"2025-06-22T19:10:53.669904066+08:00","level":"INFO","msg":"stream: closed","id":"bu5vqvh5"}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log ADDED Viewed

	@@ -0,0 +1,24 @@

+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_setup.py:_flush():70] Configure stats pid to 95583
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug.log
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/logs/debug-internal.log
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:init():852] calling init triggers
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:init():893] starting backend
+2025-06-22 19:10:38,671 INFO    MainThread:95583 [wandb_init.py:init():897] sending inform_init request
+2025-06-22 19:10:38,673 INFO    MainThread:95583 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-22 19:10:38,674 INFO    MainThread:95583 [wandb_init.py:init():907] backend started and connected
+2025-06-22 19:10:38,675 INFO    MainThread:95583 [wandb_init.py:init():1005] updated telemetry
+2025-06-22 19:10:38,678 INFO    MainThread:95583 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-22 19:10:40,981 INFO    MainThread:95583 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-22 19:10:41,122 INFO    MainThread:95583 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-22 19:10:41,122 INFO    MainThread:95583 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-22 19:10:41,125 INFO    MainThread:95583 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-22 19:10:41,125 INFO    MainThread:95583 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-22 19:10:41,126 INFO    MainThread:95583 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-22 19:10:44,823 INFO    MainThread:95583 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
+2025-06-22 19:10:49,814 INFO    MsgRouterThr:95583 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191038-bu5vqvh5/run-bu5vqvh5.wandb ADDED Viewed

Binary file (20.5 kB). View file

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/config.yaml ADDED Viewed

	@@ -0,0 +1,128 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 8
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/output.log ADDED Viewed

	@@ -0,0 +1,150 @@

+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Sanity Checking DataLoader 0:   0%|                                                             | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 100])
+labels.shape: torch.Size([25])
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Sanity Checking DataLoader 1:   0%|                                                             | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
+labels.shape: torch.Size([32])
+Sanity Checking DataLoader 1:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.84it/s]logits_per_graph.shape: torch.Size([18, 72])
+labels.shape: torch.Size([18])
+Sanity Checking DataLoader 2:   0%|                                                             | 0/2 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 128])
+labels.shape: torch.Size([32])
+Sanity Checking DataLoader 2:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.69it/s]logits_per_graph.shape: torch.Size([4])
+labels.shape: torch.Size([1])
+Traceback (most recent call last):
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+    main(args)
+  File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+    trainer.fit(model, datamodule=dm)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+    call._call_and_handle_interrupt(
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+    return function(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+    self._run(model, ckpt_path=ckpt_path)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+    results = self._run_stage()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+    self._run_sanity_check()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+    val_loop.run()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+    return loop_run(self, *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+    self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+    output = call._call_strategy_hook(trainer, hook_name, *step_args)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+    output = fn(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+    return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+    wrapper_output = wrapper_module(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+    ret_val = func(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+    loss = self.module(*inputs, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+    return inner()
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+    result = forward_call(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+    out = method(*_args, **_kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+    blip2_loss = self.blip2qformer(batch)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
+    sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+  File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
+    loss_graph = F.cross_entropy(logits_per_graph, labels)
+  File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+    return torch._C._nn.cross_entropy_loss(
+RuntimeError: size mismatch (got input: [4], target: [1])
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 119, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 85, in main
+[rank0]:     trainer.fit(model, datamodule=dm)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
+[rank0]:     call._call_and_handle_interrupt(
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
+[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
+[rank0]:     return function(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
+[rank0]:     self._run(model, ckpt_path=ckpt_path)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
+[rank0]:     results = self._run_stage()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1054, in _run_stage
+[rank0]:     self._run_sanity_check()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1083, in _run_sanity_check
+[rank0]:     val_loop.run()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
+[rank0]:     return loop_run(self, *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 145, in run
+[rank0]:     self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 437, in _evaluation_step
+[rank0]:     output = call._call_strategy_hook(trainer, hook_name, *step_args)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
+[rank0]:     output = fn(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 411, in validation_step
+[rank0]:     return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
+[rank0]:     wrapper_output = wrapper_module(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
+[rank0]:     out = method(*_args, **_kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+[rank0]:     return func(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 64, in validation_step
+[rank0]:     blip2_loss = self.blip2qformer(batch)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 141, in forward
+[rank0]:     sim_p2t, sim_t2p, loss_ptc = self.contrast_global(prot_feats, text_feats, prot_feats_all, text_feats_all, return_sim=True)
+[rank0]:   File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 104, in contrast_global
+[rank0]:     loss_graph = F.cross_entropy(logits_per_graph, labels)
+[rank0]:   File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/functional.py", line 3494, in cross_entropy
+[rank0]:     return torch._C._nn.cross_entropy_loss(
+[rank0]: RuntimeError: size mismatch (got input: [4], target: [1])

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:18:08.739768Z",
+  "args": [
+    "--devices",
+    "0,1,2,3",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363197952"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_wandb":{"runtime":11}}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,10 @@

+{"time":"2025-06-22T19:18:08.742601474+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-core.log"}
+{"time":"2025-06-22T19:18:09.773061599+08:00","level":"INFO","msg":"created new stream","id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:09.773105546+08:00","level":"INFO","msg":"stream: started","id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:09.773141968+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:09.773166056+08:00","level":"INFO","msg":"sender: started","stream_id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:09.773225667+08:00","level":"INFO","msg":"handler: started","stream_id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:11.012086945+08:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-06-22T19:18:19.860220439+08:00","level":"INFO","msg":"stream: closing","id":"a2dszq0q"}
+{"time":"2025-06-22T19:18:19.860330929+08:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-06-22T19:18:19.861063374+08:00","level":"INFO","msg":"Stopped system monitor"}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log ADDED Viewed

	@@ -0,0 +1,24 @@

+2025-06-22 19:18:08,732 INFO    MainThread:99755 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_setup.py:_flush():70] Configure stats pid to 99755
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug.log
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/logs/debug-internal.log
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:init():852] calling init triggers
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:init():893] starting backend
+2025-06-22 19:18:08,733 INFO    MainThread:99755 [wandb_init.py:init():897] sending inform_init request
+2025-06-22 19:18:08,734 INFO    MainThread:99755 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-06-22 19:18:08,736 INFO    MainThread:99755 [wandb_init.py:init():907] backend started and connected
+2025-06-22 19:18:08,740 INFO    MainThread:99755 [wandb_init.py:init():1005] updated telemetry
+2025-06-22 19:18:08,746 INFO    MainThread:99755 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-06-22 19:18:10,963 INFO    MainThread:99755 [wandb_init.py:init():1104] starting run threads in backend
+2025-06-22 19:18:11,141 INFO    MainThread:99755 [wandb_run.py:_console_start():2573] atexit reg
+2025-06-22 19:18:11,142 INFO    MainThread:99755 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-06-22 19:18:11,145 INFO    MainThread:99755 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-06-22 19:18:11,145 INFO    MainThread:99755 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-06-22 19:18:11,146 INFO    MainThread:99755 [wandb_init.py:init():1150] run started, returning control to user process
+2025-06-22 19:18:14,870 INFO    MainThread:99755 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06221723', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
+2025-06-22 19:18:19,859 INFO    MsgRouterThr:99755 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_191808-a2dszq0q/run-a2dszq0q.wandb ADDED Viewed

File without changes

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/config.yaml ADDED Viewed

	@@ -0,0 +1,128 @@

+_wandb:
+    value:
+        cli_version: 0.19.11
+        m:
+            - "1": trainer/global_step
+              "6":
+                - 3
+              "7": []
+        python_version: 3.10.0
+        t:
+            "1":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "2":
+                - 1
+                - 5
+                - 9
+                - 11
+                - 33
+                - 41
+                - 49
+                - 53
+                - 55
+                - 63
+                - 103
+            "3":
+                - 7
+                - 23
+                - 55
+                - 66
+            "4": 3.10.0
+            "5": 0.19.11
+            "6": 4.52.3
+            "8":
+                - 5
+            "12": 0.19.11
+            "13": linux-x86_64
+accelerator:
+    value: gpu
+batch_size:
+    value: 32
+bert_hidden_dim:
+    value: 768
+bert_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
+check_val_every_n_epoch:
+    value: 1
+cross_attention_freq:
+    value: 2
+devices:
+    value: 0,1,2,3,4,5,6,7
+filename:
+    value: stage1_06221723
+init_checkpoint:
+    value: ""
+init_lr:
+    value: 0.0001
+lm:
+    value: true
+load_4bit:
+    value: false
+lr_decay_rate:
+    value: 0.9
+match_batch_size:
+    value: 64
+max_epochs:
+    value: 20
+min_lr:
+    value: 1e-05
+mix_dataset:
+    value: true
+mode:
+    value: train
+num_query_token:
+    value: 8
+num_workers:
+    value: 8
+plm_name:
+    value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
+plm_tune:
+    value: freeze
+pool_size:
+    value: 0
+precision:
+    value: bf16-mixed
+projection_dim:
+    value: 256
+prot_aug:
+    value: None
+prot_max_len:
+    value: 1024
+ptm:
+    value: true
+rerank_cand_num:
+    value: 128
+retrieval_eval_epoch:
+    value: 10
+root:
+    value: data
+save_every_n_epochs:
+    value: 5
+scheduler:
+    value: linear_warmup_cosine_lr
+seed:
+    value: 42
+strategy:
+    value: deepspeed
+temperature:
+    value: 0.1
+text_max_len:
+    value: 128
+use_wandb_logger:
+    value: true
+warmup_lr:
+    value: 1e-06
+warmup_steps:
+    value: 1000
+weight_decay:
+    value: 0.05

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/output.log ADDED Viewed

	@@ -0,0 +1,110 @@

+[rank0]:W0622 19:26:47.840041 104393 site-packages/torch/distributed/distributed_c10d.py:2941] _object_to_tensor size: 81 hash value: 6444836214324640892
+/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06221723 exists and is not empty.
+Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
+  | Name         | Type         | Params | Mode
+------------------------------------------------------
+0 | blip2qformer | Blip2Qformer | 327 M  | train
+------------------------------------------------------
+179 M     Trainable params
+147 M     Non-trainable params
+327 M     Total params
+1,309.467 Total estimated model params size (MB)
+5         Modules in train mode
+926       Modules in eval mode
+Sanity Checking DataLoader 0:   0%|                                                             | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([13, 104])
+labels.shape: torch.Size([13])
+/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+  with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
+Sanity Checking DataLoader 1:   0%|                                                             | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([25, 200])
+labels.shape: torch.Size([25])
+Sanity Checking DataLoader 2:   0%|                                                             | 0/1 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([17, 136])
+labels.shape: torch.Size([17])
+Epoch 0:   0%|                                                                                 | 0/61 [00:00<?, ?it/s]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:   2%|█                                                            | 1/61 [00:01<01:11,  0.84it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:   3%|██                                                           | 2/61 [00:01<00:55,  1.07it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:   5%|███                                                          | 3/61 [00:02<00:49,  1.17it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:   7%|████                                                         | 4/61 [00:03<00:46,  1.24it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:   8%|█████                                                        | 5/61 [00:03<00:43,  1.28it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  10%|██████                                                       | 6/61 [00:04<00:42,  1.31it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  11%|███████                                                      | 7/61 [00:05<00:40,  1.33it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  13%|████████                                                     | 8/61 [00:05<00:39,  1.35it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  15%|█████████                                                    | 9/61 [00:06<00:38,  1.36it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  16%|█████████▊                                                  | 10/61 [00:07<00:37,  1.37it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  18%|██████████▊                                                 | 11/61 [00:07<00:36,  1.38it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  20%|███████████▊                                                | 12/61 [00:08<00:35,  1.39it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  21%|████████████▊                                               | 13/61 [00:09<00:34,  1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  23%|█████████████▊                                              | 14/61 [00:09<00:33,  1.40it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  25%|██████████████▊                                             | 15/61 [00:10<00:32,  1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  26%|███████████████▋                                            | 16/61 [00:11<00:31,  1.41it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  28%|████████████████▋                                           | 17/61 [00:12<00:31,  1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  30%|█████████████████▋                                          | 18/61 [00:12<00:30,  1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  31%|██████████████████▋                                         | 19/61 [00:13<00:29,  1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  33%|███████████████████▋                                        | 20/61 [00:14<00:28,  1.42it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  34%|████████████████████▋                                       | 21/61 [00:14<00:28,  1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  36%|█████████████████████▋                                      | 22/61 [00:15<00:27,  1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  38%|██████████████████████▌                                     | 23/61 [00:16<00:26,  1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  39%|███████████████████████▌                                    | 24/61 [00:16<00:25,  1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  41%|████████████████████████▌                                   | 25/61 [00:17<00:25,  1.43it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  43%|█████████████████████████▌                                  | 26/61 [00:18<00:24,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  44%|██████████████████████████▌                                 | 27/61 [00:18<00:23,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  46%|███████████████████████████▌                                | 28/61 [00:19<00:22,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  48%|████████████████████████████▌                               | 29/61 [00:20<00:22,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  49%|█████████████████████████████▌                              | 30/61 [00:20<00:21,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  51%|██████████████████████████████▍                             | 31/61 [00:21<00:20,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  52%|███████████████████████████████▍                            | 32/61 [00:22<00:20,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  54%|████████████████████████████████▍                           | 33/61 [00:22<00:19,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  56%|█████████████████████████████████▍                          | 34/61 [00:23<00:18,  1.44it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  57%|██████████████████████████████████▍                         | 35/61 [00:24<00:17,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  59%|█████████████████████████���█████████▍                        | 36/61 [00:24<00:17,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  61%|████████████████████████████████████▍                       | 37/61 [00:25<00:16,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  62%|█████████████████████████████████████▍                      | 38/61 [00:26<00:15,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  64%|██████████████████████████████████████▎                     | 39/61 [00:26<00:15,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  66%|███████████████████████████████████████▎                    | 40/61 [00:27<00:14,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  67%|████████████████████████████████████████▎                   | 41/61 [00:28<00:13,  1.45it/s, v_num=vn72]logits_per_graph.shape: torch.Size([32, 256])
+labels.shape: torch.Size([32])
+Epoch 0:  69%|█████████████████████████████████████████▎                  | 42/61 [00:28<00:13,  1.45it/s, v_num=vn72]
+Detected KeyboardInterrupt, attempting graceful shutdown ...

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,225 @@

+numpy==2.2.6
+confection==0.1.5
+text-unidecode==1.3
+contexttimer==0.3.3
+omegaconf==2.3.0
+tzdata==2025.2
+nvidia-cuda-nvrtc-cu12==12.4.127
+plotly==6.1.1
+decord==0.6.0
+nvidia-cublas-cu12==12.4.5.8
+scipy==1.15.3
+nvidia-cufile-cu12==1.11.1.6
+parso==0.8.4
+python-dateutil==2.9.0.post0
+setuptools==78.1.1
+aiosignal==1.3.2
+joblib==1.5.1
+platformdirs==4.3.8
+regex==2024.11.6
+aiohappyeyeballs==2.6.1
+virtualenv==20.31.2
+lazy_loader==0.4
+rich==14.0.0
+timm==0.4.12
+antlr4-python3-runtime==4.9.3
+pandas==2.2.3
+salesforce-lavis==1.0.2
+gitdb==4.0.12
+six==1.17.0
+smmap==5.0.2
+annotated-types==0.7.0
+pyparsing==3.2.3
+Jinja2==3.1.6
+ptyprocess==0.7.0
+streamlit==1.45.1
+idna==3.10
+nvidia-cusolver-cu12==11.6.1.9
+tenacity==9.1.2
+sentencepiece==0.2.0
+matplotlib-inline==0.1.7
+typing-inspection==0.4.1
+packaging==24.2
+nltk==3.9.1
+wheel==0.45.1
+catalogue==2.0.10
+matplotlib==3.10.3
+propcache==0.3.1
+Pygments==2.19.1
+nvidia-nvjitlink-cu12==12.4.127
+requests==2.32.3
+filelock==3.18.0
+pexpect==4.9.0
+opencv-python-headless==4.5.5.64
+certifi==2025.4.26
+nvidia-nvtx-cu12==12.4.127
+bleach==6.2.0
+typing_extensions==4.13.2
+tornado==6.5.1
+networkx==3.4.2
+sympy==1.13.1
+watchdog==6.0.0
+kaggle==1.7.4.5
+nvidia-ml-py==12.575.51
+pyarrow==20.0.0
+mpmath==1.3.0
+lightning-utilities==0.14.3
+ftfy==6.3.1
+triton==3.2.0
+referencing==0.36.2
+ipython==8.36.0
+yarl==1.20.0
+language_data==1.3.0
+cycler==0.12.1
+python-magic==0.4.27
+wasabi==1.1.3
+protobuf==6.31.0
+murmurhash==1.0.13
+jsonschema-specifications==2025.4.1
+blinker==1.9.0
+fonttools==4.58.0
+imageio==2.37.0
+pycocoevalcap==1.2
+nvidia-cuda-cupti-cu12==12.4.127
+fairscale==0.4.4
+hjson==3.1.0
+identify==2.6.12
+mdurl==0.1.2
+decorator==5.2.1
+distlib==0.3.9
+webencodings==0.5.1
+kiwisolver==1.4.8
+srsly==2.5.1
+frozenlist==1.6.0
+blis==1.3.0
+contourpy==1.3.2
+hf-xet==1.1.2
+cymem==2.0.11
+pillow==11.2.1
+pycocotools==2.0.8
+pre_commit==4.2.0
+wrapt==1.17.2
+nvidia-curand-cu12==10.3.5.147
+spacy==3.8.7
+rpds-py==0.25.1
+exceptiongroup==1.3.0
+braceexpand==0.1.7
+rouge_score==0.1.2
+async-timeout==5.0.1
+torchmetrics==1.7.1
+nvidia-nccl-cu12==2.21.5
+wcwidth==0.2.13
+nvidia-cusparselt-cu12==0.6.2
+scikit-image==0.25.2
+urllib3==2.4.0
+portalocker==3.1.1
+smart-open==7.1.0
+cfgv==3.4.0
+markdown-it-py==3.0.0
+charset-normalizer==3.4.2
+executing==2.2.0
+pure_eval==0.2.3
+safetensors==0.5.3
+spacy-legacy==3.0.12
+shellingham==1.5.4
+langcodes==3.5.0
+pytz==2025.2
+iopath==0.1.10
+weasel==0.4.1
+tifffile==2025.5.10
+nodeenv==1.9.1
+absl-py==2.2.2
+einops==0.8.1
+msgpack==1.1.0
+pydantic_core==2.33.2
+ninja==1.11.1.4
+altair==5.5.0
+attrs==25.3.0
+tqdm==4.67.1
+deepspeed==0.16.10+b666844f
+pydeck==0.9.1
+stack-data==0.6.3
+pydantic==2.11.5
+torch==2.6.0
+nvidia-cudnn-cu12==9.1.0.70
+python-slugify==8.0.4
+webdataset==0.2.111
+pytorch-lightning==2.5.1.post0
+prompt_toolkit==3.0.51
+psutil==7.0.0
+opendatasets==0.1.22
+asttokens==3.0.0
+MarkupSafe==3.0.2
+multidict==6.4.4
+nvidia-cufft-cu12==11.2.1.3
+GitPython==3.1.44
+PyYAML==6.0.2
+cloudpathlib==0.21.1
+toml==0.10.2
+marisa-trie==1.2.1
+traitlets==5.14.3
+cachetools==5.5.2
+spacy-loggers==1.0.5
+nvidia-cuda-runtime-cu12==12.4.127
+torchvision==0.21.0
+nvidia-cusparse-cu12==12.3.1.170
+jedi==0.19.2
+thinc==8.3.6
+py-cpuinfo==9.0.0
+yacs==0.1.8
+cffi==1.17.1
+preshed==3.0.10
+more-itertools==10.7.0
+bigmodelvis==0.0.1
+datasets==3.6.0
+huggingface-hub==0.32.1
+narwhals==1.41.0
+xxhash==3.5.0
+sentry-sdk==2.29.1
+aiohttp==3.12.2
+opencv-python==4.11.0.86
+pycryptodome==3.23.0
+threadpoolctl==3.6.0
+flash-attn==2.7.1.post1
+transformers==4.52.3
+pycparser==2.22
+pathlib==1.0.1
+dill==0.3.8
+scikit-learn==1.6.1
+tokenizers==0.21.1
+aliyun-python-sdk-core==2.16.0
+fsspec==2025.3.0
+jmespath==0.10.0
+click==8.2.1
+delta-center-client==0.0.4
+cheroot==10.0.1
+wandb==0.19.11
+setproctitle==1.3.6
+jsonschema==4.24.0
+oss2==2.15.0
+multiprocess==0.70.16
+jaraco.functools==4.1.0
+web.py==0.62
+aliyun-python-sdk-kms==2.16.5
+cryptography==45.0.3
+pip==25.1.1
+docker-pycreds==0.4.0
+typer==0.16.0
+opendelta==0.3.2
+crcmod==1.7
+jaraco.functools==4.0.1
+inflect==7.3.1
+jaraco.collections==5.1.0
+packaging==24.2
+wheel==0.45.1
+tomli==2.0.1
+platformdirs==4.2.2
+typing_extensions==4.12.2
+more-itertools==10.3.0
+autocommand==2.2.2
+jaraco.text==3.12.1
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+zipp==3.19.2
+backports.tarfile==1.2.0
+typeguard==4.3.0

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.0",
+  "startedAt": "2025-06-22T11:26:45.457479Z",
+  "args": [
+    "--devices",
+    "0,1,2,3,4,5,6,7",
+    "--mode",
+    "train",
+    "--filename",
+    "stage1_06221723",
+    "--num_query_token",
+    "8",
+    "--plm_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
+    "--bert_name",
+    "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
+    "--save_every_n_epochs",
+    "5",
+    "--max_epochs",
+    "20",
+    "--batch_size",
+    "32",
+    "--precision",
+    "bf16-mixed",
+    "--mix_dataset",
+    "--num_workers",
+    "8",
+    "--use_wandb_logger"
+  ],
+  "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
+  "codePath": "stage1.py",
+  "email": "gia0603yucca@gmail.com",
+  "root": "./all_checkpoints/stage1_06221723/",
+  "host": "dsw-251511-c5cfcb8-lwcpt",
+  "executable": "/root/miniconda3/envs/protT3/bin/python",
+  "codePathLocal": "stage1.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 64,
+  "gpu": "NVIDIA A800-SXM4-80GB",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "1623302262784",
+      "used": "1363202048"
+    }
+  },
+  "memory": {
+    "total": "549755813888"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 64
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A800-SXM4-80GB",
+      "memoryTotal": "85198045184",
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.1"
+}

ProtT3/all_checkpoints/stage1_06221723/wandb/run-20250622_192645-zmuhvn72/run-zmuhvn72.wandb ADDED Viewed

Binary file (58.8 kB). View file