diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log deleted file mode 100644 index 8f66fe3c7ae770c0e93c28ce15a95a46c40e21af..0000000000000000000000000000000000000000 --- a/wandb/debug-internal.log +++ /dev/null @@ -1,21 +0,0 @@ -{"time":"2025-05-04T17:25:03.375857654+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T17:25:03.375905253+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log"} -{"time":"2025-05-04T17:25:03.501241143+03:00","level":"INFO","msg":"created new stream","id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501294637+03:00","level":"INFO","msg":"stream: started","id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501448652+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501451145+03:00","level":"INFO","msg":"handler: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501574427+03:00","level":"INFO","msg":"sender: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.865922055+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T22:47:43.191425732+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} -{"time":"2025-05-05T00:01:47.351449692+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} -{"time":"2025-05-05T00:49:32.57779148+03:00","level":"INFO","msg":"stream: closing","id":"0ictlmwf"} -{"time":"2025-05-05T00:49:32.577842715+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-05T00:49:32.578849729+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-05T00:49:32.781968337+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-05T00:49:32.781997123+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-05T00:49:32.782008311+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-05T00:49:33.357099059+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-05T00:49:33.741524339+03:00","level":"INFO","msg":"handler: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741583153+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741593811+03:00","level":"INFO","msg":"sender: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741652369+03:00","level":"INFO","msg":"stream: closed","id":"0ictlmwf"} diff --git a/wandb/debug.log b/wandb/debug.log deleted file mode 100644 index 627abd37727afa0dddc772a5f08d1d451156833a..0000000000000000000000000000000000000000 --- a/wandb/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Configure stats pid to 3189710 -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug.log -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():619] calling init triggers -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():669] starting backend -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():673] sending inform_init request -2025-05-04 17:25:03,371 INFO MainThread:3189710 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 17:25:03,371 INFO MainThread:3189710 [wandb_init.py:init():686] backend started and connected -2025-05-04 17:25:03,379 INFO MainThread:3189710 [wandb_init.py:init():781] updated telemetry -2025-05-04 17:25:03,382 INFO MainThread:3189710 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 17:25:03,852 INFO MainThread:3189710 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 17:25:05,277 INFO MainThread:3189710 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 17:25:05,283 INFO MainThread:3189710 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 17:25:53,069 INFO MainThread:3189710 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_17-25-43_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-05 00:49:32,578 WARNING MsgRouterThr:3189710 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132610-pxg645u5/files/config.yaml b/wandb/run-20250504_132610-pxg645u5/files/config.yaml deleted file mode 100644 index 7e7549dbe318b236ac4d168d1610ec259f3f67e0..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/files/config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 23 - - 55 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "12": 0.18.7 - "13": linux-x86_64 diff --git a/wandb/run-20250504_132610-pxg645u5/files/output.log b/wandb/run-20250504_132610-pxg645u5/files/output.log deleted file mode 100644 index f32d8969878a7a0628870456700492bec8448c62..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/files/output.log +++ /dev/null @@ -1,37 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 45, in - train_ds = load_dataset("json", data_files={"train": "-"}, - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 2132, in load_dataset - builder_instance = load_dataset_builder( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1853, in load_dataset_builder - dataset_module = dataset_module_factory( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1562, in dataset_module_factory - ).get_module() - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 942, in get_module - data_files = DataFilesDict.from_patterns( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 721, in from_patterns - else DataFilesList.from_patterns( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 624, in from_patterns - resolve_pattern( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 411, in resolve_pattern - raise FileNotFoundError(error_msg) -FileNotFoundError: Unable to find '/arf/scratch/zisik/prott5_bc_ft/-' -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 45, in - train_ds = load_dataset("json", data_files={"train": "-"}, - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 2132, in load_dataset - builder_instance = load_dataset_builder( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1853, in load_dataset_builder - dataset_module = dataset_module_factory( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1562, in dataset_module_factory - ).get_module() - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 942, in get_module - data_files = DataFilesDict.from_patterns( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 721, in from_patterns - else DataFilesList.from_patterns( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 624, in from_patterns - resolve_pattern( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 411, in resolve_pattern - raise FileNotFoundError(error_msg) -FileNotFoundError: Unable to find '/arf/scratch/zisik/prott5_bc_ft/-' diff --git a/wandb/run-20250504_132610-pxg645u5/files/requirements.txt b/wandb/run-20250504_132610-pxg645u5/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json b/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json deleted file mode 100644 index 448328b179970362f2471973f31fb58da4f76b55..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T10:26:10.053836Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274767593472" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746613538", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027932", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746354338", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027932", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3156950", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json b/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json deleted file mode 100644 index abe7f35e04106235b4471ed10391e2de502bf8a5..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"_wandb":{"runtime":6}} \ No newline at end of file diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log b/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log deleted file mode 100644 index d8927ec645e582bb16b497af54aed2f51506dd14..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T13:26:09.392354119+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmppack6571/port-3156976.txt","pid":3156976,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T13:26:09.392402628+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T13:26:09.393200765+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36685,"Zone":""}} -{"time":"2025-05-04T13:26:09.393299078+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3156976} -{"time":"2025-05-04T13:26:09.570123715+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:10.055349971+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"pxg645u5","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:10.180212249+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"pxg645u5","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:16.993053475+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:16.994546738+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T13:26:16.993862146+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:16.994899765+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:17.953982632+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:17.954000039+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:37852"} -{"time":"2025-05-04T13:26:17.954015604+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log b/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log deleted file mode 100644 index 90be0a8f62ab298af46980179616b6b5c91f3e29..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T13:26:10.056874799+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T13:26:10.056920353+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log"} -{"time":"2025-05-04T13:26:10.180146537+03:00","level":"INFO","msg":"created new stream","id":"pxg645u5"} -{"time":"2025-05-04T13:26:10.180200098+03:00","level":"INFO","msg":"stream: started","id":"pxg645u5"} -{"time":"2025-05-04T13:26:10.180372555+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:10.180478207+03:00","level":"INFO","msg":"sender: started","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:10.18057531+03:00","level":"INFO","msg":"handler: started","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:10.587540794+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T13:26:16.993666261+03:00","level":"INFO","msg":"stream: closing","id":"pxg645u5"} -{"time":"2025-05-04T13:26:16.993748173+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T13:26:16.995793958+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T13:26:17.198876326+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T13:26:17.198909473+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T13:26:17.198920913+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T13:26:17.694743818+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T13:26:17.953755664+03:00","level":"INFO","msg":"handler: closed","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:17.953802728+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:17.953828101+03:00","level":"INFO","msg":"sender: closed","stream_id":"pxg645u5"} -{"time":"2025-05-04T13:26:17.953904675+03:00","level":"INFO","msg":"stream: closed","id":"pxg645u5"} diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug.log b/wandb/run-20250504_132610-pxg645u5/logs/debug.log deleted file mode 100644 index 468c0395d71efd915d75073afc6774b985f26212..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132610-pxg645u5/logs/debug.log +++ /dev/null @@ -1,26 +0,0 @@ -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Configure stats pid to 3156976 -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug.log -2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log -2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():619] calling init triggers -2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():669] starting backend -2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():673] sending inform_init request -2025-05-04 13:26:10,052 INFO MainThread:3156976 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 13:26:10,053 INFO MainThread:3156976 [wandb_init.py:init():686] backend started and connected -2025-05-04 13:26:10,061 INFO MainThread:3156976 [wandb_init.py:init():781] updated telemetry -2025-05-04 13:26:10,064 INFO MainThread:3156976 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 13:26:10,574 INFO MainThread:3156976 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 13:26:12,208 INFO MainThread:3156976 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 13:26:12,220 INFO MainThread:3156976 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 13:26:16,995 WARNING MsgRouterThr:3156976 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb b/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb deleted file mode 100644 index ebcf26b6563d253be1738d7c6c5bd6f413bdaf9a..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb and /dev/null differ diff --git a/wandb/run-20250504_132912-1agsw1y8/files/config.yaml b/wandb/run-20250504_132912-1agsw1y8/files/config.yaml deleted file mode 100644 index 89a4f38c983e370e131179dcc4d572a4d25e65b6..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/files/config.yaml +++ /dev/null @@ -1,374 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/grad_norm - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/learning_rate - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_13-33-08_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: true -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_132912-1agsw1y8/files/output.log b/wandb/run-20250504_132912-1agsw1y8/files/output.log deleted file mode 100644 index 8ca93eec2346930dfe72e70314a1388aa43e22d8..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/files/output.log +++ /dev/null @@ -1,87 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 511104/511104 [00:20<00:00, 25525.81 examples/s] -Map: 100%|██████████| 109522/109522 [00:04<00:00, 26956.64 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 13:33:14,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. - 33%|███▎ | 15972/47916 [2:22:01<4:54:49, 1.81it/s] -{'loss': 0.6947, 'grad_norm': 0.09912440180778503, 'learning_rate': 4.947825361048502e-05, 'epoch': 0.03} -{'loss': 0.6939, 'grad_norm': 0.23786939680576324, 'learning_rate': 4.8956507220970036e-05, 'epoch': 0.06} -{'loss': 0.6936, 'grad_norm': 0.10555226355791092, 'learning_rate': 4.843476083145505e-05, 'epoch': 0.09} -{'loss': 0.6935, 'grad_norm': 0.28058305382728577, 'learning_rate': 4.791301444194006e-05, 'epoch': 0.13} -{'loss': 0.6937, 'grad_norm': 0.13599741458892822, 'learning_rate': 4.739126805242508e-05, 'epoch': 0.16} -{'loss': 0.6935, 'grad_norm': 0.13076388835906982, 'learning_rate': 4.6869521662910095e-05, 'epoch': 0.19} -{'loss': 0.6934, 'grad_norm': 0.1778457760810852, 'learning_rate': 4.634777527339511e-05, 'epoch': 0.22} -{'loss': 0.6935, 'grad_norm': 0.4112167954444885, 'learning_rate': 4.582602888388012e-05, 'epoch': 0.25} -{'loss': 0.6934, 'grad_norm': 0.1330016702413559, 'learning_rate': 4.530428249436514e-05, 'epoch': 0.28} -{'loss': 0.6935, 'grad_norm': 0.09426847100257874, 'learning_rate': 4.478253610485016e-05, 'epoch': 0.31} -{'loss': 0.6933, 'grad_norm': 0.3686296343803406, 'learning_rate': 4.426078971533517e-05, 'epoch': 0.34} -{'loss': 0.6933, 'grad_norm': 0.21278153359889984, 'learning_rate': 4.373904332582019e-05, 'epoch': 0.38} -{'loss': 0.6935, 'grad_norm': 0.23074378073215485, 'learning_rate': 4.321834042908423e-05, 'epoch': 0.41} -{'loss': 0.6932, 'grad_norm': 0.5192509293556213, 'learning_rate': 4.269659403956925e-05, 'epoch': 0.44} -{'loss': 0.6932, 'grad_norm': 0.07643919438123703, 'learning_rate': 4.217484765005426e-05, 'epoch': 0.47} -{'loss': 0.6935, 'grad_norm': 0.09435634315013885, 'learning_rate': 4.1653101260539276e-05, 'epoch': 0.5} -{'loss': 0.6932, 'grad_norm': 0.3456329107284546, 'learning_rate': 4.113239836380333e-05, 'epoch': 0.53} -{'loss': 0.6934, 'grad_norm': 0.11689063161611557, 'learning_rate': 4.061065197428834e-05, 'epoch': 0.56} -{'loss': 0.6934, 'grad_norm': 0.25019219517707825, 'learning_rate': 4.0088905584773355e-05, 'epoch': 0.59} -{'loss': 0.6933, 'grad_norm': 0.12248441576957703, 'learning_rate': 3.956715919525837e-05, 'epoch': 0.63} -{'loss': 0.6933, 'grad_norm': 0.11549345403909683, 'learning_rate': 3.9046456298522416e-05, 'epoch': 0.66} -{'loss': 0.6934, 'grad_norm': 0.27383607625961304, 'learning_rate': 3.852470990900743e-05, 'epoch': 0.69} -{'loss': 0.6935, 'grad_norm': 0.21311810612678528, 'learning_rate': 3.800296351949245e-05, 'epoch': 0.72} -{'loss': 0.6933, 'grad_norm': 0.25916823744773865, 'learning_rate': 3.7481217129977466e-05, 'epoch': 0.75} -{'loss': 0.6934, 'grad_norm': 0.13208124041557312, 'learning_rate': 3.6960514233241504e-05, 'epoch': 0.78} -{'loss': 0.6934, 'grad_norm': 0.4182877242565155, 'learning_rate': 3.643876784372652e-05, 'epoch': 0.81} -{'loss': 0.6933, 'grad_norm': 0.19375275075435638, 'learning_rate': 3.5917021454211544e-05, 'epoch': 0.85} -{'loss': 0.6933, 'grad_norm': 0.1647150218486786, 'learning_rate': 3.5395275064696554e-05, 'epoch': 0.88} -{'loss': 0.6933, 'grad_norm': 0.458692729473114, 'learning_rate': 3.48745721679606e-05, 'epoch': 0.91} -{'loss': 0.6933, 'grad_norm': 0.24417555332183838, 'learning_rate': 3.4352825778445616e-05, 'epoch': 0.94} -{'loss': 0.6932, 'grad_norm': 0.10788150876760483, 'learning_rate': 3.383107938893063e-05, 'epoch': 0.97} - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 125, in -{'eval_loss': 0.6931192278862, 'eval_accuracy': 0.4992604225635032, 'eval_runtime': 182.4166, 'eval_samples_per_second': 600.395, 'eval_steps_per_second': 75.053, 'epoch': 1.0} - trainer.train() - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train - return inner_training_loop( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2487, in _inner_training_loop - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate - self._save_checkpoint(model, trial, metrics=metrics) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint - self.save_model(output_dir, _internal_call=True) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model - self._save(output_dir) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3721, in _save - safetensors.torch.save_file( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 286, in save_file - serialize_file(_flatten(tensors), filename, metadata=metadata) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 488, in _flatten - raise RuntimeError( -RuntimeError: - Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'encoder.encoder.embed_tokens.weight', 'encoder.shared.weight'}]. - A potential way to correctly save your model is to use `save_model`. - More information at https://huggingface.co/docs/safetensors/torch_shared_tensors - -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 125, in - trainer.train() - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train - return inner_training_loop( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2487, in _inner_training_loop - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate - self._save_checkpoint(model, trial, metrics=metrics) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint - self.save_model(output_dir, _internal_call=True) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model - self._save(output_dir) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3721, in _save - safetensors.torch.save_file( - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 286, in save_file - serialize_file(_flatten(tensors), filename, metadata=metadata) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 488, in _flatten - raise RuntimeError( -RuntimeError: - Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'encoder.encoder.embed_tokens.weight', 'encoder.shared.weight'}]. - A potential way to correctly save your model is to use `save_model`. - More information at https://huggingface.co/docs/safetensors/torch_shared_tensors - diff --git a/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt b/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json b/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json deleted file mode 100644 index e27daebbacd3a1b6062dd305d598e9d1014c3f16..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T10:29:13.019628Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274768302080" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746613727", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027934", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746354527", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027934", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3157550", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json b/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json deleted file mode 100644 index 030bbea79bd4d5fc3ae46d01de3f64e2d7ead2c3..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"train/learning_rate":3.383107938893063e-05,"train/global_step":15972,"eval/steps_per_second":75.053,"_timestamp":1.7463635035359182e+09,"eval/accuracy":0.4992604225635032,"_step":31,"eval/loss":0.6931192278862,"train/grad_norm":0.10788150876760483,"train/epoch":1,"_wandb":{"runtime":8950},"_runtime":8950.516897928,"train/loss":0.6932,"eval/runtime":182.4166,"eval/samples_per_second":600.395} \ No newline at end of file diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log deleted file mode 100644 index dad0e4abf15eab93aed95168c60fe6412f76a17e..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T13:29:12.35887463+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp1u83hfoi/port-3157577.txt","pid":3157577,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T13:29:12.358923345+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T13:29:12.35977753+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45947,"Zone":""}} -{"time":"2025-05-04T13:29:12.359879073+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3157577} -{"time":"2025-05-04T13:29:12.546636547+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34718"} -{"time":"2025-05-04T13:29:13.02161239+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"1agsw1y8","id":"127.0.0.1:34718"} -{"time":"2025-05-04T13:29:13.145638422+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"1agsw1y8","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:23.607250248+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:23.607435128+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T15:58:23.607401252+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:23.607720003+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:24.801882716+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:24.801915389+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34718"} -{"time":"2025-05-04T15:58:24.801937893+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log deleted file mode 100644 index 3e1b9c9e1960c66d21bac86084b75cecf9a700d0..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T13:29:13.023253759+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T13:29:13.023302807+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log"} -{"time":"2025-05-04T13:29:13.145570529+03:00","level":"INFO","msg":"created new stream","id":"1agsw1y8"} -{"time":"2025-05-04T13:29:13.145625833+03:00","level":"INFO","msg":"stream: started","id":"1agsw1y8"} -{"time":"2025-05-04T13:29:13.145806528+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"1agsw1y8"} -{"time":"2025-05-04T13:29:13.145923955+03:00","level":"INFO","msg":"handler: started","stream_id":"1agsw1y8"} -{"time":"2025-05-04T13:29:13.146011145+03:00","level":"INFO","msg":"sender: started","stream_id":"1agsw1y8"} -{"time":"2025-05-04T13:29:13.51656923+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T15:58:23.607363166+03:00","level":"INFO","msg":"stream: closing","id":"1agsw1y8"} -{"time":"2025-05-04T15:58:23.607412721+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T15:58:23.608736938+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T15:58:23.995834762+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T15:58:23.995863601+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T15:58:23.995874256+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T15:58:24.53730388+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T15:58:24.801427373+03:00","level":"INFO","msg":"handler: closed","stream_id":"1agsw1y8"} -{"time":"2025-05-04T15:58:24.801476891+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"1agsw1y8"} -{"time":"2025-05-04T15:58:24.801525233+03:00","level":"INFO","msg":"sender: closed","stream_id":"1agsw1y8"} -{"time":"2025-05-04T15:58:24.801589463+03:00","level":"INFO","msg":"stream: closed","id":"1agsw1y8"} diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug.log deleted file mode 100644 index ea208d69901f2374562663d7c34e15b09373c8f9..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Configure stats pid to 3157577 -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug.log -2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log -2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():619] calling init triggers -2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():669] starting backend -2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():673] sending inform_init request -2025-05-04 13:29:13,018 INFO MainThread:3157577 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 13:29:13,019 INFO MainThread:3157577 [wandb_init.py:init():686] backend started and connected -2025-05-04 13:29:13,026 INFO MainThread:3157577 [wandb_init.py:init():781] updated telemetry -2025-05-04 13:29:13,030 INFO MainThread:3157577 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 13:29:13,503 INFO MainThread:3157577 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 13:29:14,954 INFO MainThread:3157577 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 13:33:19,417 INFO MainThread:3157577 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_13-33-08_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 15:58:23,607 WARNING MsgRouterThr:3157577 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb b/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb deleted file mode 100644 index d0fe86b920af1550de340c5d128c7edf489a6165..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71cf2569d2e508f45833ce35b1904bcc5325f9369eef0a76ea074fad88d8621d -size 5615901 diff --git a/wandb/run-20250504_160615-f65jh2lv/files/output.log b/wandb/run-20250504_160615-f65jh2lv/files/output.log deleted file mode 100644 index 9f8ad836e7976228186ebb3ee636e8e5558b4888..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/files/output.log +++ /dev/null @@ -1,8 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 511104/511104 [00:20<00:00, 25304.42 examples/s] -Map: 100%|██████████| 109522/109522 [00:02<00:00, 36704.44 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:06:52,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. - 1%| | 246/47916 [02:12<7:08:44, 1.85it/s] diff --git a/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt b/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json b/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json deleted file mode 100644 index fd7b856b87ae9094e8c7410b93fc44a222546cc3..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:06:15.895027Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274886729728" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746623147", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027945", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746363947", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027945", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3178532", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log deleted file mode 100644 index 618fc3f61177df7804f2fc4a8f211c7313be9c35..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log +++ /dev/null @@ -1,7 +0,0 @@ -{"time":"2025-05-04T16:06:15.269316376+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp6sywt0mb/port-3178556.txt","pid":3178556,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:06:15.269366219+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:06:15.2702663+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3178556} -{"time":"2025-05-04T16:06:15.270143057+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37579,"Zone":""}} -{"time":"2025-05-04T16:06:15.448913658+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:49916"} -{"time":"2025-05-04T16:06:15.898453126+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"f65jh2lv","id":"127.0.0.1:49916"} -{"time":"2025-05-04T16:06:16.021719647+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"f65jh2lv","id":"127.0.0.1:49916"} diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log deleted file mode 100644 index 261eef09aa76e080a35f7789b3265f005f6d0225..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log +++ /dev/null @@ -1,8 +0,0 @@ -{"time":"2025-05-04T16:06:15.899998659+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:06:15.900045512+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log"} -{"time":"2025-05-04T16:06:16.021644692+03:00","level":"INFO","msg":"created new stream","id":"f65jh2lv"} -{"time":"2025-05-04T16:06:16.021706945+03:00","level":"INFO","msg":"stream: started","id":"f65jh2lv"} -{"time":"2025-05-04T16:06:16.021839756+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"f65jh2lv"} -{"time":"2025-05-04T16:06:16.02194891+03:00","level":"INFO","msg":"handler: started","stream_id":"f65jh2lv"} -{"time":"2025-05-04T16:06:16.022034888+03:00","level":"INFO","msg":"sender: started","stream_id":"f65jh2lv"} -{"time":"2025-05-04T16:06:16.421916148+03:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug.log deleted file mode 100644 index 06dd2b2a7d6174fa397a32c411642f714082fa74..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160615-f65jh2lv/logs/debug.log +++ /dev/null @@ -1,26 +0,0 @@ -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Configure stats pid to 3178556 -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug.log -2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log -2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():669] starting backend -2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:06:15,893 INFO MainThread:3178556 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:06:15,894 INFO MainThread:3178556 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:06:15,902 INFO MainThread:3178556 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:06:15,905 INFO MainThread:3178556 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:06:16,414 INFO MainThread:3178556 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:06:17,992 INFO MainThread:3178556 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:06:18,004 INFO MainThread:3178556 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:06:56,772 INFO MainThread:3178556 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-06-46_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} diff --git a/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb b/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb deleted file mode 100644 index 1e8a5f9b3164571a503e4306a04be53481a4529e..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb and /dev/null differ diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml b/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml deleted file mode 100644 index 7e7549dbe318b236ac4d168d1610ec259f3f67e0..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 23 - - 55 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "12": 0.18.7 - "13": linux-x86_64 diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/output.log b/wandb/run-20250504_160955-rqk2hbkf/files/output.log deleted file mode 100644 index 54e81f72adc802bd17a6e8b3e973b2290acd5201..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/files/output.log +++ /dev/null @@ -1,24 +0,0 @@ -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 33, in - X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper - return func(*args, **kwargs) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 2782, in train_test_split - arrays = indexable(*arrays) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 514, in indexable - check_consistent_length(*result) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 457, in check_consistent_length - raise ValueError( -ValueError: Found input variables with inconsistent numbers of samples: [10, 730149] -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 33, in - X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper - return func(*args, **kwargs) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 2782, in train_test_split - arrays = indexable(*arrays) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 514, in indexable - check_consistent_length(*result) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 457, in check_consistent_length - raise ValueError( -ValueError: Found input variables with inconsistent numbers of samples: [10, 730149] diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt b/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json deleted file mode 100644 index 96c1d759e7d3dd3d826e5a66a823a8a3f9265c9c..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:09:55.928947Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "272740364288" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746623370", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027946", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746364170", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027946", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3179106", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json deleted file mode 100644 index 1d52051e315a7a21a9d9e5a40a517408bb086162..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"_wandb":{"runtime":2}} \ No newline at end of file diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log deleted file mode 100644 index 999d56f784a1c5621e4f166d8ed3d656b4110162..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:09:55.241065297+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmplpbc9pnb/port-3179132.txt","pid":3179132,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:09:55.241124751+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:09:55.241864+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37981,"Zone":""}} -{"time":"2025-05-04T16:09:55.241967868+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3179132} -{"time":"2025-05-04T16:09:55.428960455+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:55.928508592+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"rqk2hbkf","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:56.056026556+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"rqk2hbkf","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:58.597503038+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:58.597631333+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:09:58.597601675+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:58.597793186+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:59.528863432+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:59.528880642+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:40950"} -{"time":"2025-05-04T16:09:59.528893164+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log deleted file mode 100644 index a63c5f27c7d8b0b1be30a0aa81b63cec47472ec9..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:09:55.930352223+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:09:55.930398642+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log"} -{"time":"2025-05-04T16:09:56.055953645+03:00","level":"INFO","msg":"created new stream","id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:56.056013829+03:00","level":"INFO","msg":"stream: started","id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:56.056183059+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:56.056291373+03:00","level":"INFO","msg":"sender: started","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:56.056498843+03:00","level":"INFO","msg":"handler: started","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:56.455842701+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:09:58.597599181+03:00","level":"INFO","msg":"stream: closing","id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:58.597716873+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:09:58.598825235+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:09:58.792882763+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:09:58.792915401+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:09:58.792926694+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:09:59.286977407+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:09:59.528666057+03:00","level":"INFO","msg":"handler: closed","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:59.528710573+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:59.528726369+03:00","level":"INFO","msg":"sender: closed","stream_id":"rqk2hbkf"} -{"time":"2025-05-04T16:09:59.528792264+03:00","level":"INFO","msg":"stream: closed","id":"rqk2hbkf"} diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log deleted file mode 100644 index bb9afff70b842f64258d1cce03b036f94b3b7f15..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log +++ /dev/null @@ -1,26 +0,0 @@ -2025-05-04 16:09:55,914 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Configure stats pid to 3179132 -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:09:55,916 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:09:55,916 INFO MainThread:3179132 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log -2025-05-04 16:09:55,916 INFO MainThread:3179132 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log -2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():669] starting backend -2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:09:55,925 INFO MainThread:3179132 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:09:55,927 INFO MainThread:3179132 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:09:55,965 INFO MainThread:3179132 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:09:55,969 INFO MainThread:3179132 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:09:56,441 INFO MainThread:3179132 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:09:57,857 INFO MainThread:3179132 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:09:57,858 INFO MainThread:3179132 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:09:57,859 INFO MainThread:3179132 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:09:57,859 INFO MainThread:3179132 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:09:57,874 INFO MainThread:3179132 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:09:58,598 WARNING MsgRouterThr:3179132 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb b/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb deleted file mode 100644 index e9a6c7bc04fa77bdb7e2940e46071101d371b1d3..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb and /dev/null differ diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml b/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml deleted file mode 100644 index 901a1d15058a51157e2bae9ec48a096a58e60825..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml +++ /dev/null @@ -1,357 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_16-12-52_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/output.log b/wandb/run-20250504_161246-rdbtc2pz/files/output.log deleted file mode 100644 index 19a60f451615e772fad3d7c838cfbc32af90c5ca..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/files/output.log +++ /dev/null @@ -1,27 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 70/70 [00:00<00:00, 4499.50 examples/s] -Map: 100%|██████████| 15/15 [00:00<00:00, 2515.68 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:12:57,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -100%|██████████| 6/6 [01:04<00:00, 10.71s/it] -Map: 100%|██████████| 15/15 [00:00<00:00, 3408.53 examples/s] -{'eval_loss': 0.2836913764476776, 'eval_accuracy': 1.0, 'eval_runtime': 0.0837, 'eval_samples_per_second': 179.205, 'eval_steps_per_second': 23.894, 'epoch': 0.89} -{'eval_loss': 0.10505779087543488, 'eval_accuracy': 1.0, 'eval_runtime': 0.0869, 'eval_samples_per_second': 172.624, 'eval_steps_per_second': 23.017, 'epoch': 1.78} -{'eval_loss': 0.05776570364832878, 'eval_accuracy': 1.0, 'eval_runtime': 0.1, 'eval_samples_per_second': 149.979, 'eval_steps_per_second': 19.997, 'epoch': 2.67} -{'train_runtime': 64.2466, 'train_samples_per_second': 3.269, 'train_steps_per_second': 0.093, 'train_loss': 0.3210471471150716, 'epoch': 2.67} -100%|██████████| 2/2 [00:00<00:00, 77.74it/s] -{'eval_loss': 0.05800781771540642, 'eval_accuracy': 1.0, 'eval_runtime': 0.0642, 'eval_samples_per_second': 233.689, 'eval_steps_per_second': 31.158, 'epoch': 2.6666666666666665} -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - model.push_to_hub("isikz/prot_t5_binary_classifier") - File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ - raise AttributeError( -AttributeError: 'T5BinaryClassifier' object has no attribute 'push_to_hub' -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - model.push_to_hub("isikz/prot_t5_binary_classifier") - File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ - raise AttributeError( -AttributeError: 'T5BinaryClassifier' object has no attribute 'push_to_hub' diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt b/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json deleted file mode 100644 index d7d40733d28be0ffdf7ad38c1cd91cd7308f5fd5..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:12:46.058889Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274907410432" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746623540", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027947", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746364340", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027947", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3179500", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json deleted file mode 100644 index ca1304c1a6f3387bb206e3ac8c9bfa50dd878c77..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"train_loss":0.3210471471150716,"_runtime":80.142129451,"train_runtime":64.2466,"eval/loss":0.05800781771540642,"eval/steps_per_second":31.158,"total_flos":0,"eval/samples_per_second":233.689,"train/global_step":6,"_timestamp":1.746364446200474e+09,"train_samples_per_second":3.269,"_wandb":{"runtime":80},"eval/runtime":0.0642,"train_steps_per_second":0.093,"train/epoch":2.6666666666666665,"eval/accuracy":1,"_step":4} \ No newline at end of file diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log deleted file mode 100644 index 2233811df4108abf9e6d1a7a308e7fd9f315ac85..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:12:45.059197409+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmphflqkva1/port-3179526.txt","pid":3179526,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:12:45.059250836+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:12:45.060076988+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3179526} -{"time":"2025-05-04T16:12:45.059982306+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45921,"Zone":""}} -{"time":"2025-05-04T16:12:45.246915089+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:12:46.063164622+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"rdbtc2pz","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:12:46.187062148+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"rdbtc2pz","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:06.269673416+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:06.269788395+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:06.26984398+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:14:06.269980058+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:07.608460726+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:07.608482723+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:33132"} -{"time":"2025-05-04T16:14:07.60849804+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log deleted file mode 100644 index 2f9c168a41928938cfb3d4e4e4131691e000328e..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:12:46.065859772+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:12:46.065909143+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log"} -{"time":"2025-05-04T16:12:46.186999454+03:00","level":"INFO","msg":"created new stream","id":"rdbtc2pz"} -{"time":"2025-05-04T16:12:46.187050012+03:00","level":"INFO","msg":"stream: started","id":"rdbtc2pz"} -{"time":"2025-05-04T16:12:46.187228889+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:12:46.187328701+03:00","level":"INFO","msg":"handler: started","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:12:46.187417103+03:00","level":"INFO","msg":"sender: started","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:12:46.598141294+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:14:06.269782406+03:00","level":"INFO","msg":"stream: closing","id":"rdbtc2pz"} -{"time":"2025-05-04T16:14:06.269825637+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:14:06.270879471+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:14:06.55541099+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:14:06.555433954+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:14:06.555445965+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:14:07.09767572+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:14:07.607443104+03:00","level":"INFO","msg":"handler: closed","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:14:07.607487355+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:14:07.607532609+03:00","level":"INFO","msg":"sender: closed","stream_id":"rdbtc2pz"} -{"time":"2025-05-04T16:14:07.607587557+03:00","level":"INFO","msg":"stream: closed","id":"rdbtc2pz"} diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log b/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log deleted file mode 100644 index 349cfbd59b697951167c42dd519765d328645a03..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 16:12:46,051 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Configure stats pid to 3179526 -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log -2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log -2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():669] starting backend -2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:12:46,057 INFO MainThread:3179526 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:12:46,058 INFO MainThread:3179526 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:12:46,064 INFO MainThread:3179526 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:12:46,067 INFO MainThread:3179526 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:12:46,584 INFO MainThread:3179526 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:12:47,974 INFO MainThread:3179526 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:13:01,857 INFO MainThread:3179526 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-12-52_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 16:14:06,270 WARNING MsgRouterThr:3179526 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb b/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb deleted file mode 100644 index f57225fe3777b25bea8c60ee43eed186f1df565a..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb and /dev/null differ diff --git a/wandb/run-20250504_162343-cp870jym/files/config.yaml b/wandb/run-20250504_162343-cp870jym/files/config.yaml deleted file mode 100644 index 18d70a8edeb099baad34583edef28bf48cb9585b..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/files/config.yaml +++ /dev/null @@ -1,357 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_16-23-49_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_162343-cp870jym/files/output.log b/wandb/run-20250504_162343-cp870jym/files/output.log deleted file mode 100644 index 37d4dd8bc142bc7bd0e5821b6fd7fc2418f4768a..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/files/output.log +++ /dev/null @@ -1,27 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 70/70 [00:00<00:00, 4479.59 examples/s] -Map: 100%|██████████| 15/15 [00:00<00:00, 2556.26 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:23:55,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -100%|██████████| 6/6 [01:08<00:00, 11.47s/it] -Map: 100%|██████████| 15/15 [00:00<00:00, 3414.44 examples/s] -{'eval_loss': 0.32496747374534607, 'eval_accuracy': 1.0, 'eval_runtime': 0.0946, 'eval_samples_per_second': 158.536, 'eval_steps_per_second': 21.138, 'epoch': 0.89} -{'eval_loss': 0.14126792550086975, 'eval_accuracy': 1.0, 'eval_runtime': 0.0935, 'eval_samples_per_second': 160.347, 'eval_steps_per_second': 21.38, 'epoch': 1.78} -{'eval_loss': 0.08305665105581284, 'eval_accuracy': 1.0, 'eval_runtime': 0.0868, 'eval_samples_per_second': 172.874, 'eval_steps_per_second': 23.05, 'epoch': 2.67} -{'train_runtime': 68.815, 'train_samples_per_second': 3.052, 'train_steps_per_second': 0.087, 'train_loss': 0.34361688296000165, 'epoch': 2.67} -100%|██████████| 2/2 [00:00<00:00, 93.00it/s] -{'eval_loss': 0.07820229977369308, 'eval_accuracy': 1.0, 'eval_runtime': 0.0516, 'eval_samples_per_second': 290.667, 'eval_steps_per_second': 38.756, 'epoch': 2.6666666666666665} -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - model.save_pretrained( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ - raise AttributeError( -AttributeError: 'T5BinaryClassifier' object has no attribute 'save_pretrained' -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - model.save_pretrained( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ - raise AttributeError( -AttributeError: 'T5BinaryClassifier' object has no attribute 'save_pretrained' diff --git a/wandb/run-20250504_162343-cp870jym/files/requirements.txt b/wandb/run-20250504_162343-cp870jym/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json b/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json deleted file mode 100644 index e5bd8ffd94d157531e5b2d7abc7c46e50d9074ff..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:23:43.746737Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274884100096" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746624198", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027950", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746364998", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027950", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3180708", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json b/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json deleted file mode 100644 index e9e34e6f68682beece2671803867da11fe15a3c5..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"_step":4,"_runtime":84.708140457,"train_runtime":68.815,"eval/runtime":0.0516,"_wandb":{"runtime":84},"train_samples_per_second":3.052,"train/epoch":2.6666666666666665,"eval/loss":0.07820229977369308,"train_loss":0.34361688296000165,"total_flos":0,"_timestamp":1.7463651084544086e+09,"eval/samples_per_second":290.667,"eval/accuracy":1,"train_steps_per_second":0.087,"train/global_step":6,"eval/steps_per_second":38.756} \ No newline at end of file diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug-core.log b/wandb/run-20250504_162343-cp870jym/logs/debug-core.log deleted file mode 100644 index 1e2f80d79d66ceb2fd9940e2b195bd656cbe50a6..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:23:43.103970405+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpgvzpqnd2/port-3180737.txt","pid":3180737,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:23:43.104018+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:23:43.104795371+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39787,"Zone":""}} -{"time":"2025-05-04T16:23:43.104898929+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3180737} -{"time":"2025-05-04T16:23:43.291758092+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:23:43.748521574+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"cp870jym","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:23:43.873512977+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"cp870jym","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:08.531174232+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:08.531307956+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:08.531367815+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:25:08.53150429+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:09.788149247+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:09.788183611+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:38582"} -{"time":"2025-05-04T16:25:09.788206528+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log b/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log deleted file mode 100644 index 1625047ba9735e4a64500c381492219a264c5df4..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:23:43.750249064+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:23:43.750294337+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug-core.log"} -{"time":"2025-05-04T16:23:43.873441585+03:00","level":"INFO","msg":"created new stream","id":"cp870jym"} -{"time":"2025-05-04T16:23:43.873500609+03:00","level":"INFO","msg":"stream: started","id":"cp870jym"} -{"time":"2025-05-04T16:23:43.873652279+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"cp870jym"} -{"time":"2025-05-04T16:23:43.873745942+03:00","level":"INFO","msg":"handler: started","stream_id":"cp870jym"} -{"time":"2025-05-04T16:23:43.873943316+03:00","level":"INFO","msg":"sender: started","stream_id":"cp870jym"} -{"time":"2025-05-04T16:23:44.451037367+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:25:08.531294356+03:00","level":"INFO","msg":"stream: closing","id":"cp870jym"} -{"time":"2025-05-04T16:25:08.531341197+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:25:08.532383047+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:25:08.797985156+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:25:08.798011707+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:25:08.798022316+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:25:09.301751579+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:25:09.787364957+03:00","level":"INFO","msg":"handler: closed","stream_id":"cp870jym"} -{"time":"2025-05-04T16:25:09.787438823+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"cp870jym"} -{"time":"2025-05-04T16:25:09.78745243+03:00","level":"INFO","msg":"sender: closed","stream_id":"cp870jym"} -{"time":"2025-05-04T16:25:09.787535096+03:00","level":"INFO","msg":"stream: closed","id":"cp870jym"} diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug.log b/wandb/run-20250504_162343-cp870jym/logs/debug.log deleted file mode 100644 index a5def13ec74b1339547213aa42656f477507efbb..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162343-cp870jym/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 16:23:43,738 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:23:43,738 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Configure stats pid to 3180737 -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug.log -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log -2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():669] starting backend -2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:23:43,745 INFO MainThread:3180737 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:23:43,746 INFO MainThread:3180737 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:23:43,754 INFO MainThread:3180737 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:23:43,757 INFO MainThread:3180737 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:23:44,437 INFO MainThread:3180737 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:23:45,830 INFO MainThread:3180737 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:23:45,831 INFO MainThread:3180737 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:23:45,831 INFO MainThread:3180737 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:23:45,832 INFO MainThread:3180737 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:23:45,842 INFO MainThread:3180737 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:23:59,567 INFO MainThread:3180737 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-23-49_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 16:25:08,531 WARNING MsgRouterThr:3180737 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb b/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb deleted file mode 100644 index 948b5b7c23ce67858ab71bf4a99f3cfd510cbee6..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb and /dev/null differ diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml b/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml deleted file mode 100644 index 2e371277d6ebafd880eb06c8f0ef37b936a3a706..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml +++ /dev/null @@ -1,357 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_16-28-19_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/output.log b/wandb/run-20250504_162813-vqs6o6w5/files/output.log deleted file mode 100644 index a8998b57d109ed7312a9c2e3f41928ec6b4905de..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/files/output.log +++ /dev/null @@ -1,23 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 70/70 [00:00<00:00, 4467.73 examples/s] -Map: 100%|██████████| 15/15 [00:00<00:00, 2557.19 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:28:25,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -100%|██████████| 6/6 [01:06<00:00, 11.03s/it] -Map: 100%|██████████| 15/15 [00:00<00:00, 3353.30 examples/s] -{'eval_loss': 0.23444823920726776, 'eval_accuracy': 1.0, 'eval_runtime': 0.0842, 'eval_samples_per_second': 178.164, 'eval_steps_per_second': 23.755, 'epoch': 0.89} -{'eval_loss': 0.08114013075828552, 'eval_accuracy': 1.0, 'eval_runtime': 0.0928, 'eval_samples_per_second': 161.657, 'eval_steps_per_second': 21.554, 'epoch': 1.78} -{'eval_loss': 0.0510762594640255, 'eval_accuracy': 1.0, 'eval_runtime': 0.0788, 'eval_samples_per_second': 190.397, 'eval_steps_per_second': 25.386, 'epoch': 2.67} -{'train_runtime': 66.2064, 'train_samples_per_second': 3.172, 'train_steps_per_second': 0.091, 'train_loss': 0.281462828318278, 'epoch': 2.67} -100%|██████████| 2/2 [00:00<00:00, 90.79it/s] -{'eval_loss': 0.046335864812135696, 'eval_accuracy': 1.0, 'eval_runtime': 0.0528, 'eval_samples_per_second': 284.031, 'eval_steps_per_second': 37.871, 'epoch': 2.6666666666666665} -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - trainer.save_model( -TypeError: Trainer.save_model() got an unexpected keyword argument 'safe_serialization' -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - trainer.save_model( -TypeError: Trainer.save_model() got an unexpected keyword argument 'safe_serialization' diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt b/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json deleted file mode 100644 index 4cdef3cdbd7a79ab90f8929362f8e799e456f186..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:28:13.563930Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274899660800" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746624467", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027952", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746365267", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027952", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3182008", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json deleted file mode 100644 index 35b5e4df9660cdaaebd8d4bd033fa5210a1d5fcd..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"train_steps_per_second":0.091,"eval/samples_per_second":284.031,"_step":4,"eval/runtime":0.0528,"eval/loss":0.046335864812135696,"total_flos":0,"train_runtime":66.2064,"train_loss":0.281462828318278,"eval/accuracy":1,"_timestamp":1.746365376058223e+09,"_runtime":82.494806798,"train/epoch":2.6666666666666665,"train_samples_per_second":3.172,"_wandb":{"runtime":82},"eval/steps_per_second":37.871,"train/global_step":6} \ No newline at end of file diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log deleted file mode 100644 index ed0d11f5576631e8906b885ebeffe69cd840f942..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:28:12.92389089+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpsu_1e075/port-3182035.txt","pid":3182035,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:28:12.923946336+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:28:12.924930159+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3182035} -{"time":"2025-05-04T16:28:12.924790098+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":40313,"Zone":""}} -{"time":"2025-05-04T16:28:13.10973957+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:28:13.567567491+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"vqs6o6w5","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:28:13.69241432+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vqs6o6w5","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:36.127793865+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:36.127929839+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:29:36.127907923+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:36.128125509+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:37.558291716+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:37.558314076+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:59452"} -{"time":"2025-05-04T16:29:37.558329488+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log deleted file mode 100644 index 92f3f80ba91679919e80e56dd8abd8e950015b71..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:28:13.569618821+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:28:13.56966556+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log"} -{"time":"2025-05-04T16:28:13.692347406+03:00","level":"INFO","msg":"created new stream","id":"vqs6o6w5"} -{"time":"2025-05-04T16:28:13.692401835+03:00","level":"INFO","msg":"stream: started","id":"vqs6o6w5"} -{"time":"2025-05-04T16:28:13.692589976+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:28:13.69268191+03:00","level":"INFO","msg":"handler: started","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:28:13.692686366+03:00","level":"INFO","msg":"sender: started","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:28:14.077830252+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:29:36.127909239+03:00","level":"INFO","msg":"stream: closing","id":"vqs6o6w5"} -{"time":"2025-05-04T16:29:36.127953372+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:29:36.129135049+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:29:36.381385986+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:29:36.381410641+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:29:36.381421107+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:29:36.890657991+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:29:37.557157658+03:00","level":"INFO","msg":"handler: closed","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:29:37.55721188+03:00","level":"INFO","msg":"sender: closed","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:29:37.557201882+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vqs6o6w5"} -{"time":"2025-05-04T16:29:37.557304847+03:00","level":"INFO","msg":"stream: closed","id":"vqs6o6w5"} diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log deleted file mode 100644 index 0fb69f8e411995c37137af59a45087557d8c1802..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 16:28:13,556 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Configure stats pid to 3182035 -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log -2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log -2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():669] starting backend -2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:28:13,562 INFO MainThread:3182035 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:28:13,563 INFO MainThread:3182035 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:28:13,569 INFO MainThread:3182035 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:28:13,572 INFO MainThread:3182035 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:28:14,063 INFO MainThread:3182035 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:28:15,461 INFO MainThread:3182035 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:28:29,768 INFO MainThread:3182035 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-28-19_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 16:29:36,128 WARNING MsgRouterThr:3182035 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb b/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb deleted file mode 100644 index a738d1c054d6cee9c4fee21d6554ff8c952d6250..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb and /dev/null differ diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml b/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml deleted file mode 100644 index b134dd5782bfb1efdf79a02c28eb4463bbe598a7..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml +++ /dev/null @@ -1,357 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_16-32-08_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/output.log b/wandb/run-20250504_163202-a8cxeqmf/files/output.log deleted file mode 100644 index 3df36c5de543befcbdc45eb6aa9b4dd90fcb1682..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/files/output.log +++ /dev/null @@ -1,35 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 70/70 [00:00<00:00, 4245.80 examples/s] -Map: 100%|██████████| 15/15 [00:00<00:00, 2515.98 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:32:13,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -100%|██████████| 6/6 [01:07<00:00, 11.21s/it] -Map: 100%|██████████| 15/15 [00:00<00:00, 3498.75 examples/s] -{'eval_loss': 0.28029781579971313, 'eval_accuracy': 1.0, 'eval_runtime': 0.0833, 'eval_samples_per_second': 180.04, 'eval_steps_per_second': 24.005, 'epoch': 0.89} -{'eval_loss': 0.1000773161649704, 'eval_accuracy': 1.0, 'eval_runtime': 0.0863, 'eval_samples_per_second': 173.864, 'eval_steps_per_second': 23.182, 'epoch': 1.78} -{'eval_loss': 0.05684203654527664, 'eval_accuracy': 1.0, 'eval_runtime': 0.0937, 'eval_samples_per_second': 160.033, 'eval_steps_per_second': 21.338, 'epoch': 2.67} -{'train_runtime': 67.2983, 'train_samples_per_second': 3.12, 'train_steps_per_second': 0.089, 'train_loss': 0.31141672531763714, 'epoch': 2.67} -100%|██████████| 2/2 [00:00<00:00, 101.26it/s] -{'eval_loss': 0.04954631254076958, 'eval_accuracy': 1.0, 'eval_runtime': 0.0471, 'eval_samples_per_second': 318.692, 'eval_steps_per_second': 42.492, 'epoch': 2.6666666666666665} -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - trainer.save_model( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model - self._save(output_dir) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3704, in _save - os.makedirs(output_dir, exist_ok=True) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/os.py", line 225, in makedirs - mkdir(name, mode) -PermissionError: [Errno 13] Permission denied: '/prott5_bc_ft' -Traceback (most recent call last): - File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in - trainer.save_model( - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model - self._save(output_dir) - File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3704, in _save - os.makedirs(output_dir, exist_ok=True) - File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/os.py", line 225, in makedirs - mkdir(name, mode) -PermissionError: [Errno 13] Permission denied: '/prott5_bc_ft' diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt b/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json deleted file mode 100644 index 43011783c61f4ffb7c1e598cc8f3542d15c2ce5a..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:32:02.055600Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274920230912" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746624696", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027955", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746365496", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027955", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3182736", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json deleted file mode 100644 index ef3a5d0d977ffd8ab00ee2541f1ee061080e14a6..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"_wandb":{"runtime":83},"train_loss":0.31141672531763714,"train_steps_per_second":0.089,"eval/accuracy":1,"eval/steps_per_second":42.492,"train_samples_per_second":3.12,"eval/samples_per_second":318.692,"train/global_step":6,"train_runtime":67.2983,"_runtime":83.580626717,"_timestamp":1.746365605635744e+09,"eval/loss":0.04954631254076958,"train/epoch":2.6666666666666665,"total_flos":0,"eval/runtime":0.0471,"_step":4} \ No newline at end of file diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log deleted file mode 100644 index 4d5cf6f58a5c9fc81b4569f485360bbc66a4434a..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:32:01.418393778+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmplvhvsc_q/port-3182760.txt","pid":3182760,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:32:01.418441665+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:32:01.419205535+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45799,"Zone":""}} -{"time":"2025-05-04T16:32:01.419534072+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3182760} -{"time":"2025-05-04T16:32:01.606152917+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:32:02.057688618+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"a8cxeqmf","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:32:02.186607102+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"a8cxeqmf","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:25.702060103+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:25.702177617+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:25.702273436+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:33:25.702373794+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:26.974600306+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:26.974618713+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:48576"} -{"time":"2025-05-04T16:33:26.974630492+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log deleted file mode 100644 index ebf1e141889b4f95835db70d0596ae304d399be9..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:32:02.059376166+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:32:02.059422726+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log"} -{"time":"2025-05-04T16:32:02.18652874+03:00","level":"INFO","msg":"created new stream","id":"a8cxeqmf"} -{"time":"2025-05-04T16:32:02.186595094+03:00","level":"INFO","msg":"stream: started","id":"a8cxeqmf"} -{"time":"2025-05-04T16:32:02.18671057+03:00","level":"INFO","msg":"handler: started","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:32:02.186759328+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:32:02.186873015+03:00","level":"INFO","msg":"sender: started","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:32:02.609103171+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:33:25.702185181+03:00","level":"INFO","msg":"stream: closing","id":"a8cxeqmf"} -{"time":"2025-05-04T16:33:25.702250772+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:33:25.703241445+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:33:25.984446677+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:33:25.98447338+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:33:25.984484498+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:33:26.497866306+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:33:26.973748978+03:00","level":"INFO","msg":"handler: closed","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:33:26.973797312+03:00","level":"INFO","msg":"sender: closed","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:33:26.973781655+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"a8cxeqmf"} -{"time":"2025-05-04T16:33:26.973934766+03:00","level":"INFO","msg":"stream: closed","id":"a8cxeqmf"} diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log deleted file mode 100644 index 2d0d3a8a5a0e5c625f7eb9466c7bbde46220ad7b..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Configure stats pid to 3182760 -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log -2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log -2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():669] starting backend -2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:32:02,054 INFO MainThread:3182760 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:32:02,055 INFO MainThread:3182760 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:32:02,063 INFO MainThread:3182760 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:32:02,066 INFO MainThread:3182760 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:32:02,595 INFO MainThread:3182760 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:32:03,943 INFO MainThread:3182760 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:32:03,951 INFO MainThread:3182760 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:32:18,271 INFO MainThread:3182760 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-32-08_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 16:33:25,702 WARNING MsgRouterThr:3182760 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb b/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb deleted file mode 100644 index 6a806abc084b97c4e0ea4f3b996341285fb7d3e9..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb and /dev/null differ diff --git a/wandb/run-20250504_163644-j17n0z1w/files/config.yaml b/wandb/run-20250504_163644-j17n0z1w/files/config.yaml deleted file mode 100644 index 47672e8d6eabcdaeb89962fefba89c107dc6703e..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/files/config.yaml +++ /dev/null @@ -1,357 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/accuracy - "5": 1 - "6": - - 1 - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 1 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 1 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 1 - "6": - - 1 - - 3 - "7": [] - - "1": eval/loss - "5": 1 - "6": - - 1 - - 3 - "7": [] - - "1": eval/runtime - "5": 1 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_16-36-51_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_163644-j17n0z1w/files/output.log b/wandb/run-20250504_163644-j17n0z1w/files/output.log deleted file mode 100644 index 789e4c60cde818136ebb5c5f55d9196d2dbffb5e..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/files/output.log +++ /dev/null @@ -1,15 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 70/70 [00:00<00:00, 6893.99 examples/s] -Map: 100%|██████████| 15/15 [00:00<00:00, 3422.06 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 16:36:56,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -100%|██████████| 6/6 [01:09<00:00, 11.54s/it] -Map: 100%|██████████| 15/15 [00:00<00:00, 3398.03 examples/s] -{'eval_loss': 0.30133458971977234, 'eval_accuracy': 1.0, 'eval_runtime': 0.084, 'eval_samples_per_second': 178.602, 'eval_steps_per_second': 23.814, 'epoch': 0.89} -{'eval_loss': 0.14025470614433289, 'eval_accuracy': 1.0, 'eval_runtime': 0.0899, 'eval_samples_per_second': 166.815, 'eval_steps_per_second': 22.242, 'epoch': 1.78} -{'eval_loss': 0.09236248582601547, 'eval_accuracy': 1.0, 'eval_runtime': 0.0606, 'eval_samples_per_second': 247.332, 'eval_steps_per_second': 32.978, 'epoch': 2.67} -{'train_runtime': 69.2309, 'train_samples_per_second': 3.033, 'train_steps_per_second': 0.087, 'train_loss': 0.34036485354105633, 'epoch': 2.67} -100%|██████████| 2/2 [00:00<00:00, 93.34it/s] -{'eval_loss': 0.09890136122703552, 'eval_accuracy': 1.0, 'eval_runtime': 0.0503, 'eval_samples_per_second': 298.458, 'eval_steps_per_second': 39.794, 'epoch': 2.6666666666666665} diff --git a/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt b/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json b/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json deleted file mode 100644 index 661f0244432fdb8428ff70df9987a780c88edab2..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T13:36:44.683493Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274930868224" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746624978", - "job_gid": "11636", - "job_gpus": "1", - "job_id": "1027956", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746365778", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027956", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3183359", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json b/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json deleted file mode 100644 index ff8192ac678b3079d8ae2b540b83b8069023ccca..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"eval/loss":0.09890136122703552,"_timestamp":1.746365889900344e+09,"train_steps_per_second":0.087,"train/epoch":2.6666666666666665,"eval/accuracy":1,"_wandb":{"runtime":90},"total_flos":0,"train_samples_per_second":3.033,"eval/samples_per_second":298.458,"eval/steps_per_second":39.794,"train_runtime":69.2309,"eval/runtime":0.0503,"train_loss":0.34036485354105633,"_step":4,"train/global_step":6,"_runtime":85.217340117} \ No newline at end of file diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log deleted file mode 100644 index 740d60d9f71c8c587518099b5e357d43e8786f46..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T16:36:43.800622213+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpudnn84p2/port-3183386.txt","pid":3183386,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T16:36:43.800675477+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T16:36:43.801533455+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3183386} -{"time":"2025-05-04T16:36:43.801429105+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34585,"Zone":""}} -{"time":"2025-05-04T16:36:43.98511968+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:36:44.686997088+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"j17n0z1w","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:36:44.811113202+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"j17n0z1w","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:15.462653307+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:15.462760405+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:15.462866235+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-04T16:38:15.462928073+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:16.450021056+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:16.450050764+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:42446"} -{"time":"2025-05-04T16:38:16.450073997+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log deleted file mode 100644 index 702df31c286faaf6c26783e1a598776dc6727960..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log +++ /dev/null @@ -1,19 +0,0 @@ -{"time":"2025-05-04T16:36:44.68958036+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T16:36:44.68962696+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log"} -{"time":"2025-05-04T16:36:44.811045191+03:00","level":"INFO","msg":"created new stream","id":"j17n0z1w"} -{"time":"2025-05-04T16:36:44.81110033+03:00","level":"INFO","msg":"stream: started","id":"j17n0z1w"} -{"time":"2025-05-04T16:36:44.811127326+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:36:44.812597235+03:00","level":"INFO","msg":"handler: started","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:36:44.812682202+03:00","level":"INFO","msg":"sender: started","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:36:45.2302005+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T16:38:15.462763223+03:00","level":"INFO","msg":"stream: closing","id":"j17n0z1w"} -{"time":"2025-05-04T16:38:15.462833186+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-04T16:38:15.463959432+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-04T16:38:15.653986013+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-04T16:38:15.654018889+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-04T16:38:15.654030152+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-04T16:38:16.194806616+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-04T16:38:16.449749922+03:00","level":"INFO","msg":"handler: closed","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:38:16.449817209+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:38:16.449847499+03:00","level":"INFO","msg":"sender: closed","stream_id":"j17n0z1w"} -{"time":"2025-05-04T16:38:16.449922381+03:00","level":"INFO","msg":"stream: closed","id":"j17n0z1w"} diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug.log deleted file mode 100644 index d737c7f055847a85314b84c35816a14c7b1b12cf..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_163644-j17n0z1w/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Configure stats pid to 3183386 -2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug.log -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:init():619] calling init triggers -2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 16:36:44,678 INFO MainThread:3183386 [wandb_init.py:init():669] starting backend -2025-05-04 16:36:44,678 INFO MainThread:3183386 [wandb_init.py:init():673] sending inform_init request -2025-05-04 16:36:44,682 INFO MainThread:3183386 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 16:36:44,683 INFO MainThread:3183386 [wandb_init.py:init():686] backend started and connected -2025-05-04 16:36:44,690 INFO MainThread:3183386 [wandb_init.py:init():781] updated telemetry -2025-05-04 16:36:44,693 INFO MainThread:3183386 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 16:36:45,217 INFO MainThread:3183386 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 16:36:46,651 INFO MainThread:3183386 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 16:37:00,590 INFO MainThread:3183386 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-36-51_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-04 16:38:15,463 WARNING MsgRouterThr:3183386 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb b/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb deleted file mode 100644 index dd24d8341cb77819cc99d2fe0367ee037f5874e8..0000000000000000000000000000000000000000 Binary files a/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb and /dev/null differ diff --git a/wandb/run-20250504_172503-0ictlmwf/files/config.yaml b/wandb/run-20250504_172503-0ictlmwf/files/config.yaml deleted file mode 100644 index 90819acdbf6ef774fd8d0e6d0cd98145d182ec2b..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/files/config.yaml +++ /dev/null @@ -1,375 +0,0 @@ -_wandb: - value: - cli_version: 0.18.7 - m: - - "1": eval/runtime - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/global_step - "6": - - 3 - "7": [] - - "1": eval/samples_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/grad_norm - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/learning_rate - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": train/epoch - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/loss - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/accuracy - "5": 2 - "6": - - 1 - - 3 - "7": [] - - "1": eval/steps_per_second - "5": 2 - "6": - - 1 - - 3 - "7": [] - python_version: 3.10.15 - t: - "1": - - 1 - - 2 - - 3 - - 5 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "2": - - 1 - - 2 - - 3 - - 5 - - 6 - - 11 - - 12 - - 49 - - 51 - - 53 - - 55 - - 71 - - 98 - - 105 - "3": - - 7 - - 23 - - 55 - - 62 - - 66 - "4": 3.10.15 - "5": 0.18.7 - "6": 4.45.2 - "8": - - 5 - "9": - "1": transformers_trainer - "12": 0.18.7 - "13": linux-x86_64 -accelerator_config: - value: - dispatch_batches: null - even_batches: true - gradient_accumulation_kwargs: null - non_blocking: false - split_batches: false - use_seedable_sampler: true -adafactor: - value: false -adam_beta1: - value: 0.9 -adam_beta2: - value: 0.999 -adam_epsilon: - value: 1e-08 -auto_find_batch_size: - value: false -batch_eval_metrics: - value: false -bf16: - value: false -bf16_full_eval: - value: false -data_seed: - value: null -dataloader_drop_last: - value: false -dataloader_num_workers: - value: 0 -dataloader_persistent_workers: - value: false -dataloader_pin_memory: - value: true -dataloader_prefetch_factor: - value: null -ddp_backend: - value: null -ddp_broadcast_buffers: - value: null -ddp_bucket_cap_mb: - value: null -ddp_find_unused_parameters: - value: null -ddp_timeout: - value: 1800 -debug: - value: [] -deepspeed: - value: null -disable_tqdm: - value: false -dispatch_batches: - value: null -do_eval: - value: true -do_predict: - value: false -do_train: - value: false -eval_accumulation_steps: - value: null -eval_delay: - value: 0 -eval_do_concat_batches: - value: true -eval_on_start: - value: false -eval_steps: - value: null -eval_strategy: - value: epoch -eval_use_gather_object: - value: false -evaluation_strategy: - value: epoch -fp16: - value: true -fp16_backend: - value: auto -fp16_full_eval: - value: false -fp16_opt_level: - value: O1 -fsdp: - value: [] -fsdp_config: - value: - min_num_params: 0 - xla: false - xla_fsdp_grad_ckpt: false - xla_fsdp_v2: false -fsdp_min_num_params: - value: 0 -fsdp_transformer_layer_cls_to_wrap: - value: null -full_determinism: - value: false -gradient_accumulation_steps: - value: 4 -gradient_checkpointing: - value: false -gradient_checkpointing_kwargs: - value: null -greater_is_better: - value: false -group_by_length: - value: false -half_precision_backend: - value: auto -hub_always_push: - value: false -hub_model_id: - value: null -hub_private_repo: - value: false -hub_strategy: - value: every_save -hub_token: - value: -ignore_data_skip: - value: false -include_inputs_for_metrics: - value: false -include_num_input_tokens_seen: - value: false -include_tokens_per_second: - value: false -jit_mode_eval: - value: false -label_names: - value: null -label_smoothing_factor: - value: 0 -learning_rate: - value: 5e-05 -length_column_name: - value: length -load_best_model_at_end: - value: true -local_rank: - value: 0 -log_level: - value: passive -log_level_replica: - value: warning -log_on_each_node: - value: true -logging_dir: - value: t5-bc-out/runs/May04_17-25-43_kolyoz1 -logging_first_step: - value: false -logging_nan_inf_filter: - value: true -logging_steps: - value: 500 -logging_strategy: - value: steps -lr_scheduler_type: - value: linear -max_grad_norm: - value: 1 -max_steps: - value: -1 -metric_for_best_model: - value: loss -mp_parameters: - value: "" -neftune_noise_alpha: - value: null -no_cuda: - value: false -num_train_epochs: - value: 3 -optim: - value: adamw_torch -optim_args: - value: null -optim_target_modules: - value: null -output_dir: - value: t5-bc-out -overwrite_output_dir: - value: false -past_index: - value: -1 -per_device_eval_batch_size: - value: 8 -per_device_train_batch_size: - value: 8 -per_gpu_eval_batch_size: - value: null -per_gpu_train_batch_size: - value: null -prediction_loss_only: - value: false -push_to_hub: - value: false -push_to_hub_model_id: - value: null -push_to_hub_organization: - value: null -push_to_hub_token: - value: -ray_scope: - value: last -remove_unused_columns: - value: true -report_to: - value: - - wandb -restore_callback_states_from_checkpoint: - value: false -resume_from_checkpoint: - value: null -run_name: - value: t5-bc-out -save_on_each_node: - value: false -save_only_model: - value: false -save_safetensors: - value: false -save_steps: - value: 500 -save_strategy: - value: epoch -save_total_limit: - value: null -seed: - value: 42 -skip_memory_metrics: - value: true -split_batches: - value: null -tf32: - value: null -torch_compile: - value: false -torch_compile_backend: - value: null -torch_compile_mode: - value: null -torch_empty_cache_steps: - value: null -torchdynamo: - value: null -tpu_metrics_debug: - value: false -tpu_num_cores: - value: null -use_cpu: - value: false -use_ipex: - value: false -use_legacy_prediction_loop: - value: false -use_liger_kernel: - value: false -use_mps_device: - value: false -warmup_ratio: - value: 0 -warmup_steps: - value: 0 -weight_decay: - value: 0 diff --git a/wandb/run-20250504_172503-0ictlmwf/files/output.log b/wandb/run-20250504_172503-0ictlmwf/files/output.log deleted file mode 100644 index f279e4f7e0075186a5bddae1ec00f2da2afeb33d..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/files/output.log +++ /dev/null @@ -1,110 +0,0 @@ -You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -Map: 100%|██████████| 511104/511104 [00:29<00:00, 17366.65 examples/s] -Map: 100%|██████████| 109522/109522 [00:04<00:00, 26402.34 examples/s] -/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead - warnings.warn( -[2025-05-04 17:25:48,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. - 33%|███▎ | 15972/47916 [2:23:32<4:53:49, 1.81it/s] -{'loss': 0.5856, 'grad_norm': 1.3348039388656616, 'learning_rate': 4.947825361048502e-05, 'epoch': 0.03} -{'loss': 0.5183, 'grad_norm': 2.473144292831421, 'learning_rate': 4.8956507220970036e-05, 'epoch': 0.06} -{'loss': 0.4879, 'grad_norm': 3.6210598945617676, 'learning_rate': 4.843476083145505e-05, 'epoch': 0.09} -{'loss': 0.4579, 'grad_norm': 6.336288928985596, 'learning_rate': 4.791405793471909e-05, 'epoch': 0.13} -{'loss': 0.4421, 'grad_norm': 2.6699299812316895, 'learning_rate': 4.739231154520411e-05, 'epoch': 0.16} -{'loss': 0.4205, 'grad_norm': 7.918868064880371, 'learning_rate': 4.6870565155689124e-05, 'epoch': 0.19} -{'loss': 0.4044, 'grad_norm': 2.9816083908081055, 'learning_rate': 4.634881876617414e-05, 'epoch': 0.22} -{'loss': 0.3901, 'grad_norm': 7.581803321838379, 'learning_rate': 4.582707237665916e-05, 'epoch': 0.25} -{'loss': 0.3834, 'grad_norm': 6.031352996826172, 'learning_rate': 4.5305325987144174e-05, 'epoch': 0.28} -{'loss': 0.3601, 'grad_norm': 2.581623077392578, 'learning_rate': 4.478357959762919e-05, 'epoch': 0.31} -{'loss': 0.3492, 'grad_norm': 4.7024245262146, 'learning_rate': 4.42618332081142e-05, 'epoch': 0.34} -{'loss': 0.3435, 'grad_norm': 8.929915428161621, 'learning_rate': 4.374217380415728e-05, 'epoch': 0.38} -{'loss': 0.3366, 'grad_norm': 3.694370985031128, 'learning_rate': 4.32204274146423e-05, 'epoch': 0.41} -{'loss': 0.3259, 'grad_norm': 5.6961350440979, 'learning_rate': 4.2698681025127307e-05, 'epoch': 0.44} -{'loss': 0.3224, 'grad_norm': 2.740339756011963, 'learning_rate': 4.217693463561232e-05, 'epoch': 0.47} -{'loss': 0.3103, 'grad_norm': 3.7285494804382324, 'learning_rate': 4.165518824609734e-05, 'epoch': 0.5} -{'loss': 0.3107, 'grad_norm': 5.1480326652526855, 'learning_rate': 4.1133441856582356e-05, 'epoch': 0.53} -{'loss': 0.2945, 'grad_norm': 4.8817620277404785, 'learning_rate': 4.0611695467067366e-05, 'epoch': 0.56} -{'loss': 0.2903, 'grad_norm': 5.003459453582764, 'learning_rate': 4.008994907755238e-05, 'epoch': 0.59} -{'loss': 0.284, 'grad_norm': 6.451533317565918, 'learning_rate': 3.95682026880374e-05, 'epoch': 0.63} -{'loss': 0.276, 'grad_norm': 7.442136287689209, 'learning_rate': 3.9046456298522416e-05, 'epoch': 0.66} -{'loss': 0.27, 'grad_norm': 3.617513656616211, 'learning_rate': 3.852575340178646e-05, 'epoch': 0.69} -{'loss': 0.2666, 'grad_norm': 5.776317596435547, 'learning_rate': 3.800400701227148e-05, 'epoch': 0.72} -{'loss': 0.257, 'grad_norm': 6.264099597930908, 'learning_rate': 3.7482260622756494e-05, 'epoch': 0.75} -{'loss': 0.2566, 'grad_norm': 4.222651481628418, 'learning_rate': 3.6960514233241504e-05, 'epoch': 0.78} -{'loss': 0.2502, 'grad_norm': 6.953704833984375, 'learning_rate': 3.643876784372652e-05, 'epoch': 0.81} -{'loss': 0.2364, 'grad_norm': 3.2264351844787598, 'learning_rate': 3.591806494699057e-05, 'epoch': 0.85} -{'loss': 0.2451, 'grad_norm': 6.233669281005859, 'learning_rate': 3.539631855747558e-05, 'epoch': 0.88} -{'loss': 0.2364, 'grad_norm': 8.540342330932617, 'learning_rate': 3.48745721679606e-05, 'epoch': 0.91} -{'loss': 0.2312, 'grad_norm': 4.3881516456604, 'learning_rate': 3.4352825778445616e-05, 'epoch': 0.94} -{'loss': 0.2323, 'grad_norm': 6.7153167724609375, 'learning_rate': 3.383107938893063e-05, 'epoch': 0.97} - -{'eval_loss': 0.2026778757572174, 'eval_accuracy': 0.9204725991125071, 'eval_runtime': 180.0542, 'eval_samples_per_second': 608.272, 'eval_steps_per_second': 76.038, 'epoch': 1.0} -{'loss': 0.2163, 'grad_norm': 4.329936504364014, 'learning_rate': 3.331037649219468e-05, 'epoch': 1.0} -{'loss': 0.139, 'grad_norm': 8.806492805480957, 'learning_rate': 3.278863010267969e-05, 'epoch': 1.03} -{'loss': 0.1419, 'grad_norm': 9.733407020568848, 'learning_rate': 3.226688371316471e-05, 'epoch': 1.06} -{'loss': 0.1361, 'grad_norm': 3.5503616333007812, 'learning_rate': 3.174513732364972e-05, 'epoch': 1.1} -{'loss': 0.1398, 'grad_norm': 5.853847503662109, 'learning_rate': 3.122339093413474e-05, 'epoch': 1.13} -{'loss': 0.1373, 'grad_norm': 1.6936904191970825, 'learning_rate': 3.0701644544619754e-05, 'epoch': 1.16} -{'loss': 0.1423, 'grad_norm': 1.5299335718154907, 'learning_rate': 3.017989815510477e-05, 'epoch': 1.19} -{'loss': 0.1391, 'grad_norm': 3.899322986602783, 'learning_rate': 2.965815176558978e-05, 'epoch': 1.22} -{'loss': 0.1408, 'grad_norm': 2.3118438720703125, 'learning_rate': 2.913744886885383e-05, 'epoch': 1.25} -{'loss': 0.1408, 'grad_norm': 0.6930440068244934, 'learning_rate': 2.8615702479338845e-05, 'epoch': 1.28} -{'loss': 0.1404, 'grad_norm': 2.851909875869751, 'learning_rate': 2.8093956089823858e-05, 'epoch': 1.31} -{'loss': 0.1382, 'grad_norm': 0.22848767042160034, 'learning_rate': 2.7572209700308875e-05, 'epoch': 1.35} -{'loss': 0.1396, 'grad_norm': 3.973886489868164, 'learning_rate': 2.7050463310793888e-05, 'epoch': 1.38} -{'loss': 0.127, 'grad_norm': 3.140080451965332, 'learning_rate': 2.6529760414057936e-05, 'epoch': 1.41} -{'loss': 0.1276, 'grad_norm': 5.468123435974121, 'learning_rate': 2.6008014024542953e-05, 'epoch': 1.44} -{'loss': 0.1219, 'grad_norm': 0.626640260219574, 'learning_rate': 2.5486267635027966e-05, 'epoch': 1.47} -{'loss': 0.1319, 'grad_norm': 3.1899547576904297, 'learning_rate': 2.496452124551298e-05, 'epoch': 1.5} -{'loss': 0.1298, 'grad_norm': 3.199150562286377, 'learning_rate': 2.4442774855997996e-05, 'epoch': 1.53} -{'loss': 0.1217, 'grad_norm': 5.129565715789795, 'learning_rate': 2.3921028466483013e-05, 'epoch': 1.57} -{'loss': 0.1288, 'grad_norm': 4.223311424255371, 'learning_rate': 2.339928207696803e-05, 'epoch': 1.6} -{'loss': 0.1263, 'grad_norm': 10.741965293884277, 'learning_rate': 2.2877535687453046e-05, 'epoch': 1.63} -{'loss': 0.122, 'grad_norm': 3.0217132568359375, 'learning_rate': 2.235578929793806e-05, 'epoch': 1.66} -{'loss': 0.122, 'grad_norm': 7.847172737121582, 'learning_rate': 2.1835086401202104e-05, 'epoch': 1.69} -{'loss': 0.1266, 'grad_norm': 9.223713874816895, 'learning_rate': 2.1313340011687117e-05, 'epoch': 1.72} -{'loss': 0.1274, 'grad_norm': 2.0706963539123535, 'learning_rate': 2.0791593622172137e-05, 'epoch': 1.75} -{'loss': 0.1214, 'grad_norm': 3.1475393772125244, 'learning_rate': 2.0270890725436182e-05, 'epoch': 1.78} -{'loss': 0.1191, 'grad_norm': 3.7348415851593018, 'learning_rate': 1.9749144335921196e-05, 'epoch': 1.82} -{'loss': 0.1199, 'grad_norm': 3.230713129043579, 'learning_rate': 1.9227397946406212e-05, 'epoch': 1.85} -{'loss': 0.1176, 'grad_norm': 0.4691683351993561, 'learning_rate': 1.8705651556891226e-05, 'epoch': 1.88} -{'loss': 0.1176, 'grad_norm': 4.382262706756592, 'learning_rate': 1.8183905167376242e-05, 'epoch': 1.91} -{'loss': 0.1083, 'grad_norm': 9.810182571411133, 'learning_rate': 1.7662158777861255e-05, 'epoch': 1.94} -{'loss': 0.1103, 'grad_norm': 8.107538223266602, 'learning_rate': 1.7140412388346275e-05, 'epoch': 1.97} -{'eval_loss': 0.1829579919576645, 'eval_accuracy': 0.9478369642628878, 'eval_runtime': 179.9731, 'eval_samples_per_second': 608.547, 'eval_steps_per_second': 76.072, 'epoch': 2.0} -{'loss': 0.1087, 'grad_norm': 0.5452843308448792, 'learning_rate': 1.661866599883129e-05, 'epoch': 2.0} -{'loss': 0.0456, 'grad_norm': 1.0569943189620972, 'learning_rate': 1.6097963102095334e-05, 'epoch': 2.03} -{'loss': 0.0523, 'grad_norm': 0.22022764384746552, 'learning_rate': 1.557621671258035e-05, 'epoch': 2.07} -{'loss': 0.0492, 'grad_norm': 9.75222396850586, 'learning_rate': 1.5054470323065365e-05, 'epoch': 2.1} -{'loss': 0.0498, 'grad_norm': 3.1281306743621826, 'learning_rate': 1.453272393355038e-05, 'epoch': 2.13} -{'loss': 0.0506, 'grad_norm': 0.012396792881190777, 'learning_rate': 1.4012021036814427e-05, 'epoch': 2.16} -{'loss': 0.0569, 'grad_norm': 6.527154922485352, 'learning_rate': 1.3490274647299442e-05, 'epoch': 2.19} -{'loss': 0.0548, 'grad_norm': 3.5429670810699463, 'learning_rate': 1.2968528257784457e-05, 'epoch': 2.22} -{'loss': 0.0558, 'grad_norm': 1.333369255065918, 'learning_rate': 1.2446781868269472e-05, 'epoch': 2.25} -{'loss': 0.0464, 'grad_norm': 0.10260029882192612, 'learning_rate': 1.1926078971533518e-05, 'epoch': 2.29} -{'loss': 0.0515, 'grad_norm': 0.14060164988040924, 'learning_rate': 1.1404332582018533e-05, 'epoch': 2.32} -{'loss': 0.0448, 'grad_norm': 1.031032919883728, 'learning_rate': 1.0882586192503548e-05, 'epoch': 2.35} -{'loss': 0.0475, 'grad_norm': 0.20121368765830994, 'learning_rate': 1.0360839802988565e-05, 'epoch': 2.38} -{'loss': 0.0522, 'grad_norm': 0.06531311571598053, 'learning_rate': 9.84013690625261e-06, 'epoch': 2.41} -{'loss': 0.0434, 'grad_norm': 0.04498385637998581, 'learning_rate': 9.318390516737625e-06, 'epoch': 2.44} -{'loss': 0.0468, 'grad_norm': 0.3482716679573059, 'learning_rate': 8.796644127222641e-06, 'epoch': 2.47} -{'loss': 0.0505, 'grad_norm': 4.0475053787231445, 'learning_rate': 8.274897737707656e-06, 'epoch': 2.5} -{'loss': 0.0421, 'grad_norm': 0.6960127353668213, 'learning_rate': 7.753151348192671e-06, 'epoch': 2.54} -{'loss': 0.0451, 'grad_norm': 0.8902493119239807, 'learning_rate': 7.231404958677686e-06, 'epoch': 2.57} -{'loss': 0.0522, 'grad_norm': 0.46462351083755493, 'learning_rate': 6.710702061941732e-06, 'epoch': 2.6} -{'loss': 0.0468, 'grad_norm': 0.07463126629590988, 'learning_rate': 6.1889556724267476e-06, 'epoch': 2.63} -{'loss': 0.0429, 'grad_norm': 0.05138092488050461, 'learning_rate': 5.6672092829117625e-06, 'epoch': 2.66} -{'loss': 0.038, 'grad_norm': 0.06017659977078438, 'learning_rate': 5.145462893396778e-06, 'epoch': 2.69} -{'loss': 0.0418, 'grad_norm': 3.794154405593872, 'learning_rate': 4.624759996660823e-06, 'epoch': 2.72} -{'loss': 0.0418, 'grad_norm': 9.929149627685547, 'learning_rate': 4.103013607145838e-06, 'epoch': 2.75} -{'loss': 0.0435, 'grad_norm': 0.10156802833080292, 'learning_rate': 3.5812672176308544e-06, 'epoch': 2.79} -{'loss': 0.039, 'grad_norm': 15.590471267700195, 'learning_rate': 3.0595208281158697e-06, 'epoch': 2.82} -{'loss': 0.0451, 'grad_norm': 0.1026441678404808, 'learning_rate': 2.5377744386008846e-06, 'epoch': 2.85} -{'loss': 0.0408, 'grad_norm': 0.08782440423965454, 'learning_rate': 2.0160280490859004e-06, 'epoch': 2.88} -{'loss': 0.0372, 'grad_norm': 17.5203857421875, 'learning_rate': 1.494281659570916e-06, 'epoch': 2.91} -{'loss': 0.041, 'grad_norm': 0.08832889050245285, 'learning_rate': 9.735787628349612e-07, 'epoch': 2.94} -{'loss': 0.0417, 'grad_norm': 10.057083129882812, 'learning_rate': 4.518323733199766e-07, 'epoch': 2.97} -{'eval_loss': 0.2335142344236374, 'eval_accuracy': 0.9541735906941071, 'eval_runtime': 176.4196, 'eval_samples_per_second': 620.804, 'eval_steps_per_second': 77.605, 'epoch': 3.0} -{'train_runtime': 26437.5455, 'train_samples_per_second': 57.998, 'train_steps_per_second': 1.812, 'train_loss': 0.1687752874718155, 'epoch': 3.0} -100%|██████████| 13691/13691 [02:53<00:00, 78.95it/s] -{'eval_loss': 0.17655357718467712, 'eval_accuracy': 0.9493257124074396, 'eval_runtime': 173.4293, 'eval_samples_per_second': 631.514, 'eval_steps_per_second': 78.943, 'epoch': 3.0} diff --git a/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt b/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt deleted file mode 100644 index 847c45ecccb522de294762faeeb01fe5fb02f7ac..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt +++ /dev/null @@ -1,541 +0,0 @@ -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -pyg-lib==0.4.0+pt20cu117 -biopython==1.85 -iniconfig==2.0.0 -tokenizers==0.20.0 -accelerate==1.3.0 -torch==2.6.0 -nvidia-nccl-cu12==2.21.5 -transformers==4.45.2 -nvidia-cusparse-cu12==12.3.1.170 -torch-scatter==2.1.2+pt20cu117 -nvidia-cusparselt-cu12==0.6.2 -nvidia-nvtx-cu12==12.4.127 -zstd==1.5.6.6 -fair-esm==2.0.0 -omegaconf==2.3.0 -pluggy==1.5.0 -pytest==8.3.5 -nvidia-curand-cu12==10.3.5.147 -nvidia-cufft-cu12==11.2.1.3 -torch-cluster==1.6.3+pt20cu117 -regex==2024.9.11 -nvidia-cudnn-cu12==9.1.0.70 -torch-spline-conv==1.2.2+pt20cu117 -nvidia-cusolver-cu12==11.6.1.9 -antlr4-python3-runtime==4.9.3 -msgpack-numpy==0.4.8 -nlp==0.2.0 -einops==0.8.1 -nvidia-cublas-cu12==12.4.5.8 -triton==3.2.0 -ninja==1.11.1.3 -hydra-core==1.3.2 -nvidia-nvjitlink-cu12==12.4.127 -biotite==0.41.2 -torch-sparse==0.6.18+pt20cu117 -esm==3.1.4 -sympy==1.13.1 -nvidia-cuda-runtime-cu12==12.4.127 -jupyter-lsp==2.2.5 -jupyter-events==0.10.0 -ipykernel==6.29.5 -Mako==1.3.5 -proto-plus==1.25.0 -fst-pso==1.8.1 -gensim==4.3.3 -htmlmin==0.1.12 -tokenizers==0.13.3 -timm==1.0.11 -MarkupSafe==3.0.2 -safetensors==0.4.5 -requests==2.32.3 -gast==0.5.5 -cuml==24.12.0a33 -jaxlib==0.4.23.dev20240214 -spacy-loggers==1.0.5 -pytz==2024.1 -idna==3.10 -python-dateutil==2.9.0 -mdurl==0.1.2 -blis==0.7.10 -jupyter==1.1.1 -pyerfa==2.0.1.5 -comm==0.2.2 -pygraphviz==1.14 -dill==0.3.8 -paramiko==3.5.0 -llama-index==0.8.36 -mdit-py-plugins==0.4.2 -Werkzeug==3.1.3 -pyu2f==0.1.5 -dask-glm==0.2.0 -httpx==0.27.2 -typeguard==4.4.1 -mypy-extensions==1.0.0 -kmodes==0.12.2 -keras==2.15.0 -ydata-profiling==0.0.dev0 -regex==2024.11.6 -xarray==2024.11.0 -setuptools==75.3.0 -charset-normalizer==3.4.0 -jupyterlab_nvdashboard==0.11.0 -pylibraft==24.12.0a36 -spacy==3.7.6 -mlflow-skinny==2.17.2 -nvtx==0.2.10 -multimethod==1.12 -pexpect==4.9.0 -torch==2.1.0.post301 -flatbuffers==24.3.25 -python-json-logger==2.0.7 -PyJWT==2.9.0 -multiprocess==0.70.16 -colorlover==0.3.0 -yarl==1.16.0 -locket==1.0.0 -patsy==1.0.0 -rapids-dask-dependency==24.12.0a0 -stanza==1.9.2 -debugpy==1.8.8 -jupyterlab_pygments==0.3.0 -pylibcudf==24.12.0a337 -lz4==4.3.3 -pandas==2.2.3 -tifffile==2024.9.20 -pynvml==11.4.1 -cufflinks==0.17.3 -ipywidgets==8.1.5 -requests-oauthlib==2.0.0 -google-auth-oauthlib==1.2.1 -rsa==4.9 -webcolors==24.8.0 -jsonschema-specifications==2024.10.1 -scikit-learn==1.5.2 -langchain-text-splitters==0.3.2 -pandas-datareader==0.10.0 -tomli==2.0.2 -tzdata==2024.2 -scikit-image==0.24.0 -tensorboard_data_server==0.7.0 -kiwisolver==1.4.7 -cloudpathlib==0.20.0 -isodate==0.6.1 -adversarial-robustness-toolbox==1.19.1 -SQLAlchemy==2.0.36 -pytest-runner==6.0.0 -pycairo==1.27.0 -treelite==4.3.0 -jiter==0.7.0 -threadpoolctl==3.5.0 -pandocfilters==1.5.0 -loguru==0.7.2 -smart_open==7.0.5 -shellingham==1.5.4 -deepspeed==0.15.4 -prompt_toolkit==3.0.48 -databricks-sdk==0.34.0 -langchain-core==0.3.15 -imageio==2.36.0 -openapi-schema-pydantic==1.2.4 -zict==3.0.0 -cachetools==5.5.0 -colorful==0.5.6 -mpmath==1.3.0 -nest_asyncio==1.6.0 -pyFUME==0.2.25 -opencv-python-headless==4.9.0 -fastai==2.7.18 -importlib_resources==6.4.5 -binaryornot==0.4.4 -evaluate==0.4.1 -matplotlib-inline==0.1.7 -wasabi==1.1.2 -pycparser==2.22 -GitPython==3.1.43 -pluggy==1.5.0 -async-lru==2.0.4 -pgmpy==0.1.24 -anyio==4.4.0 -executing==2.1.0 -orjson==3.10.11 -humanfriendly==10.0 -tornado==6.4.1 -gmpy2==2.1.5 -rlPyCairo==0.2.0 -distributed==2024.11.0 -FuzzyTM==2.0.5 -torchtext==0.15.2a0+5ce3163 -pytest==8.3.5 -pyod==2.0.2 -ImageHash==4.3.1 -soupsieve==2.5 -tblib==3.0.0 -emoji==2.14.0 -aiohappyeyeballs==2.4.3 -uri-template==1.3.0 -tensorflow_estimator==2.15.0 -babel==2.16.0 -dask-cuda==24.12.0a12 -overrides==7.7.0 -opencensus==0.11.3 -openai==0.28.1 -language_data==1.2.0 -jedi==0.19.2 -cookiecutter==2.6.0 -entrypoints==0.4 -exceptiongroup==1.2.2 -marisa-trie==1.2.0 -uvloop==0.20.0 -aiosignal==1.3.1 -Flask==3.0.3 -tensorboard==2.15.2 -cffi==1.17.1 -tf_keras==2.15.0 -absl-py==2.1.0 -blinker==1.9.0 -types-python-dateutil==2.9.0.20241003 -opencv-python==4.9.0 -frozendict==2.4.6 -aiohttp-cors==0.7.0 -statsmodels==0.14.4 -tinycss2==1.4.0 -terminado==0.18.1 -pycaret==2.2.3 -aiohttp==3.10.10 -distributed-ucxx==0.41.0 -prometheus_client==0.21.0 -fastdownload==0.0.7 -grpcio==1.59.3 -google-api-core==2.22.0 -jupyterlab_widgets==3.0.13 -appdirs==1.4.4 -littleutils==0.0.0 -ray==2.24.0 -kaggle==1.6.17 -jsonschema==4.23.0 -google-auth==2.36.0 -scikit-base==0.11.0 -visions==0.7.6 -pyarrow==15.0.0 -transformers==4.33.0 -prometheus_flask_exporter==0.23.1 -dm-tree==0.1.8 -colorama==0.4.6 -requests-toolbelt==1.0.0 -cached-property==1.5.2 -cymem==2.0.8 -PyNaCl==1.5.0 -PyWavelets==1.7.0 -httptools==0.6.1 -typing-utils==0.1.0 -email_validator==2.2.0 -marshmallow==3.23.1 -Deprecated==1.2.14 -virtualenv==20.4.7 -optuna==3.6.1 -jupyter_server==2.14.2 -termcolor==2.5.0 -mpi4py==4.0.1 -torchdata==0.7.1+8cea82f -dataclasses==0.8 -cloudpickle==3.1.0 -tree_sitter_languages==1.10.2 -tabulate==0.9.0 -ipython==8.29.0 -lightgbm==4.3.0 -captum==0.6.0 -confuse==2.0.1 -torchvision==0.16.1+adc3221 -lxml==4.9.4 -fastapi==0.115.4 -python-multipart==0.0.17 -dnspython==2.7.0 -jupyter-console==6.6.3 -preshed==3.0.9 -py-cpuinfo==9.0.0 -Send2Trash==1.8.3 -murmurhash==1.0.10 -sniffio==1.3.1 -websockets==13.1 -h11==0.14.0 -smmap==5.0.0 -textual==0.85.2 -jsonpatch==1.33 -opencensus-context==0.1.3 -nbconvert==7.16.4 -sentry-sdk==2.19.0 -opentelemetry-semantic-conventions==0.37b0 -pandas-profiling==2.8.0 -pillow==10.3.0 -peft==0.13.2 -rpds-py==0.21.0 -bokeh==3.6.1 -distro==1.9.0 -itsdangerous==2.2.0 -wandb==0.18.7 -jsonpointer==3.0.0 -astropy-iers-data==0.2024.11.11.0.32.38 -horovod==0.28.1 -graphviz==0.20.3 -vtk==9.3.1 -bleach==6.2.0 -numexpr==2.8.7 -pydantic_core==2.23.4 -Jinja2==3.1.4 -widgetsnbextension==4.0.13 -filelock==3.16.1 -catboost==1.2.7 -raft-dask==24.12.0a36 -async-timeout==4.0.3 -datefinder==0.7.3 -coloredlogs==15.0.1 -platformdirs==4.3.6 -spacy-legacy==3.0.12 -chardet==5.2.0 -jupyter_client==8.6.3 -importlib_metadata==8.5.0 -rfc3986-validator==0.1.1 -huggingface_hub==0.26.2 -PySocks==1.7.1 -mlxtend==0.23.2 -outdated==0.2.2 -partd==1.4.2 -thinc==8.2.5 -astropy==6.1.6 -rdflib==6.3.2 -h2==4.1.0 -typer==0.13.0 -xyzservices==2024.9.0 -toolz==0.12.1 -frozenlist==1.5.0 -rdkit==2024.9.2 -pyasn1==0.6.1 -jupyter_server_terminals==0.5.3 -ucx-py==0.41.0a11 -astunparse==1.6.3 -simpful==2.12.0 -notebook_shim==0.2.4 -scipy==1.13.1 -colorlog==6.9.0 -tiktoken==0.3.3 -plotly==5.24.1 -fastrlock==0.8.2 -chart-studio==1.1.0 -stack-data==0.6.2 -google-pasta==0.2.0 -sktime==0.34.0 -PyYAML==6.0.2 -sympy==1.13.3 -multidict==6.1.0 -ml-dtypes==0.2.0 -tensorboardX==2.6.2.2 -decorator==5.1.1 -cytoolz==1.0.0 -ase==3.23.0 -isoduration==20.11.0 -html5lib==1.1 -langsmith==0.1.142 -future==1.0.0 -onnx2torch==1.5.15 -multipledispatch==0.6.0 -protobuf==4.24.4 -ucxx==0.41.0 -pandas_flavor==0.6.0 -msgpack==1.1.0 -pyasn1_modules==0.4.1 -imagecodecs==2024.1.1 -mlflow==2.17.2 -watchfiles==0.24.0 -dm-sonnet==2.0.2 -langcodes==3.4.1 -freetype-py==2.3.0 -argon2-cffi-bindings==21.2.0 -trimesh==4.5.2 -opt_einsum==3.4.0 -tenacity==8.5.0 -h5py==3.12.1 -fastapi-cli==0.0.5 -oauthlib==3.2.2 -parso==0.8.4 -weasel==0.4.1 -yfinance==0.2.49 -networkx==2.8.8 -bitsandbytes==0.44.1 -lazy_loader==0.4 -querystring_parser==1.2.4 -contourpy==1.3.0 -unicodedata2==15.1.0 -bcrypt==4.2.0 -munkres==1.1.4 -langchain==0.0.298 -hpack==4.0.0 -cryptography==43.0.3 -umap-learn==0.5.7 -arrow==1.3.0 -docker==7.1.0 -certifi==2025.1.31 -fastjsonschema==2.20.0 -tensorflow==2.15.0 -googleapis-common-protos==1.65.0 -iniconfig==2.0.0 -Markdown==3.6 -llvmlite==0.43.0 -wslink==2.3.2 -attrs==24.2.0 -rich==13.9.4 -cupy==13.3.0 -uc-micro-py==1.0.3 -alembic==1.14.0 -joblib==1.4.2 -reportlab==4.2.5 -miniful==0.0.6 -jupyter_core==5.7.2 -wheel==0.45.0 -phik==0.12.3 -mistune==3.0.2 -wcwidth==0.2.13 -dacite==1.8.1 -accelerate==0.22.0 -sacremoses==0.0.53 -revtok==0.0.3 -python-slugify==8.0.4 -tangled-up-in-unicode==0.2.0 -dask==2024.11.0 -markdown-it-py==3.0.0 -sentencepiece==0.1.99 -beautifulsoup4==4.12.3 -six==1.16.0 -numba-cuda==0.0.17 -argon2-cffi==23.1.0 -xxhash==3.5.0 -hjson==3.1.0 -fonttools==4.54.1 -graphql-core==3.2.5 -pyparsing==3.2.0 -pure_eval==0.2.3 -distlib==0.3.9 -lightning==2.4.0 -wordcloud==0.0.0 -catalogue==2.0.10 -jax==0.4.27 -tree-sitter==0.23.2 -notebook==7.2.2 -dataclasses-json==0.6.7 -propcache==0.2.0 -numba==0.60.0 -dask-expr==1.1.17 -pydantic==2.9.2 -gunicorn==22.0.0 -missingno==0.5.2 -pyOpenSSL==24.2.1 -openpyxl==3.1.5 -packaging==24.1 -python-dotenv==1.0.1 -cycler==0.12.1 -types-pytz==2024.2.0.20241003 -yellowbrick==1.5 -referencing==0.35.1 -pyLDAvis==3.4.1 -lazypredict==0.2.16 -fqdn==1.5.1 -websocket-client==1.8.0 -fastcore==1.7.19 -pynvjitlink-cu12==0.3.0 -pingouin==0.5.5 -numpy==1.26.4 -typing-inspect==0.9.0 -nltk==3.9.1 -onnxruntime==1.19.2 -tensorflow-probability==0.23.0 -datasets==3.0.2 -pickleshare==0.7.5 -peewee==3.17.7 -torch-geometric==2.6.1 -ptyprocess==0.7.0 -greenlet==3.1.1 -graphql-relay==3.2.0 -graphene==3.4.3 -et_xmlfile==2.0.0 -webencodings==0.5.1 -hyperframe==6.0.1 -multitasking==0.0.9 -typer-slim==0.13.0 -onnx==1.15.0 -uvicorn==0.32.0 -memray==1.13.4 -xgboost==2.1.2 -Brotli==1.1.0 -zipp==3.21.0 -nbformat==5.10.4 -responses==0.18.0 -funcy==2.0 -Pygments==2.18.0 -tqdm==4.67.0 -linkify-it-py==2.0.3 -srsly==2.4.8 -cuda-python==12.6.0 -lightning-utilities==0.11.8 -cudf==24.12.0a337 -dask-ml==2024.4.4 -docker-pycreds==0.4.0 -pkgutil_resolve_name==1.3.10 -opentelemetry-api==1.16.0 -fsspec==2024.9.0 -nbclient==0.10.0 -psutil==5.9.8 -pytorch-lightning==2.4.0 -sortedcontainers==2.4.0 -matplotlib==3.9.2 -defusedxml==0.7.1 -urllib3==1.26.19 -jupyterlab_server==2.27.3 -retrying==1.3.3 -dask-cudf==24.12.0a337 -sqlparse==0.5.1 -text-unidecode==1.3 -seaborn==0.13.2 -typing_extensions==4.12.2 -pyzmq==26.2.0 -rfc3339-validator==0.1.4 -pynndescent==0.5.13 -pip==24.3.1 -confection==0.1.4 -wrapt==1.14.1 -fastprogress==1.0.3 -traitlets==5.14.3 -asttokens==2.4.1 -json5==0.9.28 -pandas-stubs==2.2.3.241126 -torchmetrics==1.2.1 -gitdb==4.0.11 -annotated-types==0.7.0 -ipython-autotime==0.1 -httpcore==1.0.6 -click==8.1.7 -setproctitle==1.3.3 -starlette==0.41.2 -jupyterlab==4.2.5 -rmm==24.12.0a27 -opentelemetry-sdk==1.16.0 -textblob==0.15.3 -imbalanced-learn==0.12.4 -typeguard==4.3.0 -more-itertools==10.3.0 -zipp==3.19.2 -autocommand==2.2.2 -jaraco.context==5.3.0 -packaging==24.1 -importlib_metadata==8.0.0 -platformdirs==4.2.2 -jaraco.functools==4.0.1 -importlib_resources==6.4.0 -tomli==2.0.1 -jaraco.text==3.12.1 -wheel==0.43.0 -jaraco.collections==5.1.0 -typing_extensions==4.12.2 -inflect==7.3.1 -backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json b/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json deleted file mode 100644 index f3c892eb99e110ed3a340c5c88fca4d4e7601345..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", - "python": "3.10.15", - "startedAt": "2025-05-04T14:25:03.372176Z", - "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", - "codePath": "finetuning_bc_prott5.py", - "email": "zeynep.isik1@sabanciuniv.edu", - "root": "/arf/scratch/zisik/prott5_bc_ft", - "host": "kolyoz1", - "username": "zisik", - "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", - "codePathLocal": "finetuning_bc_prott5.py", - "cpu_count": 64, - "cpu_count_logical": 64, - "gpu": "NVIDIA H100 80GB HBM3", - "gpu_count": 1, - "disk": { - "/": { - "total": "7643995308032", - "used": "274939207680" - } - }, - "memory": { - "total": "1081373220864" - }, - "cpu": { - "count": 64, - "countLogical": 64 - }, - "gpu_nvidia": [ - { - "name": "NVIDIA H100 80GB HBM3", - "memoryTotal": "85520809984", - "cudaCores": 16896, - "architecture": "Hopper" - } - ], - "slurm": { - "cluster_name": "cuda", - "conf": "/etc/slurm/slurm.conf", - "cpus_on_node": "16", - "cpus_per_task": "16", - "gpus_on_node": "1", - "gtids": "0", - "job_account": "tbag154", - "job_cpus_per_node": "16", - "job_end_time": "1746627878", - "job_gid": "11636", - "job_gpus": "3", - "job_id": "1027971", - "job_name": "msa_ph_pt", - "job_nodelist": "kolyoz1", - "job_num_nodes": "1", - "job_partition": "kolyoz-cuda", - "job_qos": "tbag", - "job_start_time": "1746368678", - "job_uid": "11636", - "job_user": "zisik", - "jobid": "1027971", - "localid": "0", - "mem_per_cpu": "14000", - "nnodes": "1", - "node_aliases": "(null)", - "nodeid": "0", - "nodelist": "kolyoz1", - "prio_process": "0", - "procid": "0", - "submit_dir": "/arf/scratch/zisik", - "submit_host": "cuda-ui", - "task_pid": "3189684", - "tasks_per_node": "1", - "topology_addr": "kolyoz1", - "topology_addr_pattern": "node", - "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" - }, - "cudaVersion": "12.6" -} \ No newline at end of file diff --git a/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json b/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json deleted file mode 100644 index 1d540fd3ad4df49061ca8c1d3c6754d4ef243041..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"train/epoch":3,"_step":99,"train/grad_norm":10.057083129882812,"train/learning_rate":4.518323733199766e-07,"eval/steps_per_second":78.943,"train/loss":0.0417,"train_steps_per_second":1.812,"_wandb":{"runtime":26669},"total_flos":0,"eval/loss":0.17655357718467712,"train_loss":0.1687752874718155,"train_runtime":26437.5455,"eval/accuracy":0.9493257124074396,"_timestamp":1.7463953681937246e+09,"train_samples_per_second":57.998,"eval/runtime":173.4293,"train/global_step":47916,"eval/samples_per_second":631.514,"_runtime":26664.822052477} \ No newline at end of file diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log deleted file mode 100644 index be6c40900e077df04434c2058af02e8279bfd761..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log +++ /dev/null @@ -1,14 +0,0 @@ -{"time":"2025-05-04T17:25:02.471184127+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpiwo6benn/port-3189710.txt","pid":3189710,"debug":false,"disable-analytics":false} -{"time":"2025-05-04T17:25:02.471231751+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} -{"time":"2025-05-04T17:25:02.472141348+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":33653,"Zone":""}} -{"time":"2025-05-04T17:25:02.472268534+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3189710} -{"time":"2025-05-04T17:25:02.658908169+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:54060"} -{"time":"2025-05-04T17:25:03.374276784+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"0ictlmwf","id":"127.0.0.1:54060"} -{"time":"2025-05-04T17:25:03.501310319+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"0ictlmwf","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:32.577676717+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:32.577829204+03:00","level":"INFO","msg":"server is shutting down"} -{"time":"2025-05-05T00:49:32.577794736+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:32.578007826+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:33.742564589+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:33.742592681+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:54060"} -{"time":"2025-05-05T00:49:33.742613896+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log deleted file mode 100644 index 8f66fe3c7ae770c0e93c28ce15a95a46c40e21af..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log +++ /dev/null @@ -1,21 +0,0 @@ -{"time":"2025-05-04T17:25:03.375857654+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} -{"time":"2025-05-04T17:25:03.375905253+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log"} -{"time":"2025-05-04T17:25:03.501241143+03:00","level":"INFO","msg":"created new stream","id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501294637+03:00","level":"INFO","msg":"stream: started","id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501448652+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501451145+03:00","level":"INFO","msg":"handler: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.501574427+03:00","level":"INFO","msg":"sender: started","stream_id":"0ictlmwf"} -{"time":"2025-05-04T17:25:03.865922055+03:00","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-05-04T22:47:43.191425732+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} -{"time":"2025-05-05T00:01:47.351449692+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} -{"time":"2025-05-05T00:49:32.57779148+03:00","level":"INFO","msg":"stream: closing","id":"0ictlmwf"} -{"time":"2025-05-05T00:49:32.577842715+03:00","level":"INFO","msg":"Stopping system monitor"} -{"time":"2025-05-05T00:49:32.578849729+03:00","level":"INFO","msg":"Stopped system monitor"} -{"time":"2025-05-05T00:49:32.781968337+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} -{"time":"2025-05-05T00:49:32.781997123+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} -{"time":"2025-05-05T00:49:32.782008311+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} -{"time":"2025-05-05T00:49:33.357099059+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} -{"time":"2025-05-05T00:49:33.741524339+03:00","level":"INFO","msg":"handler: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741583153+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741593811+03:00","level":"INFO","msg":"sender: closed","stream_id":"0ictlmwf"} -{"time":"2025-05-05T00:49:33.741652369+03:00","level":"INFO","msg":"stream: closed","id":"0ictlmwf"} diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug.log deleted file mode 100644 index 627abd37727afa0dddc772a5f08d1d451156833a..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/logs/debug.log +++ /dev/null @@ -1,27 +0,0 @@ -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Configure stats pid to 3189710 -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} -2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug.log -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():619] calling init triggers -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():626] wandb.init called with sweep_config: {} -config: {} -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():669] starting backend -2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():673] sending inform_init request -2025-05-04 17:25:03,371 INFO MainThread:3189710 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-05-04 17:25:03,371 INFO MainThread:3189710 [wandb_init.py:init():686] backend started and connected -2025-05-04 17:25:03,379 INFO MainThread:3189710 [wandb_init.py:init():781] updated telemetry -2025-05-04 17:25:03,382 INFO MainThread:3189710 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout -2025-05-04 17:25:03,852 INFO MainThread:3189710 [wandb_init.py:init():867] starting run threads in backend -2025-05-04 17:25:05,277 INFO MainThread:3189710 [wandb_run.py:_console_start():2456] atexit reg -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2305] redirect: wrap_raw -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2370] Wrapping output streams. -2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2395] Redirects installed. -2025-05-04 17:25:05,283 INFO MainThread:3189710 [wandb_init.py:init():911] run started, returning control to user process -2025-05-04 17:25:53,069 INFO MainThread:3189710 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_17-25-43_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} -2025-05-05 00:49:32,578 WARNING MsgRouterThr:3189710 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb b/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb deleted file mode 100644 index f8cc6f4ca642c6b7dd3b09e91225198cbf4cc952..0000000000000000000000000000000000000000 --- a/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:006a1cac0ce47f9249802031630141a9f36a796d14d7c386b77939289e4b498e -size 17270813