diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b2d83e30ec4844c9ed86a4aee244540440deb515 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +ready_to_train.csv filter=lfs diff=lfs merge=lfs -text +wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf40fb15fa27dea5d9d92a580d1bca519b3f4f3 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,102 @@ +{ + "": 127, + "": 117, + "": 116, + "": 115, + "": 114, + "": 113, + "": 112, + "": 111, + "": 110, + "": 109, + "": 108, + "": 126, + "": 107, + "": 106, + "": 105, + "": 104, + "": 103, + "": 102, + "": 101, + "": 100, + "": 99, + "": 98, + "": 125, + "": 97, + "": 96, + "": 95, + "": 94, + "": 93, + "": 92, + "": 91, + "": 90, + "": 89, + "": 88, + "": 124, + "": 87, + "": 86, + "": 85, + "": 84, + "": 83, + "": 82, + "": 81, + "": 80, + "": 79, + "": 78, + "": 123, + "": 77, + "": 76, + "": 75, + "": 74, + "": 73, + "": 72, + "": 71, + "": 70, + "": 69, + "": 68, + "": 122, + "": 67, + "": 66, + "": 65, + "": 64, + "": 63, + "": 62, + "": 61, + "": 60, + "": 59, + "": 58, + "": 121, + "": 57, + "": 56, + "": 55, + "": 54, + "": 53, + "": 52, + "": 51, + "": 50, + "": 49, + "": 48, + "": 120, + "": 47, + "": 46, + "": 45, + "": 44, + "": 43, + "": 42, + "": 41, + "": 40, + "": 39, + "": 38, + "": 119, + "": 37, + "": 36, + "": 35, + "": 34, + "": 33, + "": 32, + "": 31, + "": 30, + "": 29, + "": 28, + "": 118 +} diff --git a/config.json b/config.json new file mode 100644 index 
0000000000000000000000000000000000000000..af1c864b04cc7849b7c1b65f535655a300ecbcc5 --- /dev/null +++ b/config.json @@ -0,0 +1,8 @@ +{ + "architectures": [ + "T5BinaryClassifier" + ], + "model_type": "t5", + "d_model": 1024, + "is_encoder_decoder": false +} \ No newline at end of file diff --git a/finetuning_bc_prott5.py b/finetuning_bc_prott5.py new file mode 100644 index 0000000000000000000000000000000000000000..25f436bde3fd6bd925a036dbd5adac6cf0f320ae --- /dev/null +++ b/finetuning_bc_prott5.py @@ -0,0 +1,149 @@ +import torch, torch.nn as nn +from transformers import (T5EncoderModel, T5Tokenizer, + Trainer, TrainingArguments) +from transformers.modeling_outputs import SequenceClassifierOutput +from datasets import load_dataset +from sklearn.metrics import accuracy_score +import pandas as pd +import wandb +from huggingface_hub import login +import re +from datasets import Dataset + +# --------------------------- +# 1. GİRİŞ‑ÇIKIŞ ve LOGIN +# --------------------------- + +wandb.login() +wandb.init(project='finetuning-bc-protT5') + +# --------------------------- +# 2. 
# DATA PREPARATION (your own data)
# ---------------------------
# Load the table and keep only the windows that are exactly 15 characters long.
data = pd.read_csv("ready_to_train.csv")
pos_mask = data["SITE_+/-7_AA"].str.len() == 15
neg_mask = data["NON_PH_SITE"].str.len() == 15
pos = data.loc[pos_mask, "SITE_+/-7_AA"].tolist()
neg = data.loc[neg_mask, "NON_PH_SITE"].tolist()

# Positives are listed first, so the labels follow the same order.
texts = pos + neg
labels = [1] * len(pos) + [0] * len(neg)

# Upper-case each window, separate the characters with spaces, map U/Z/O/B
# to X and "_" to "-".
prep_texts = [
    re.sub(r"[UZOB]", "X", " ".join(t.upper())).replace("_", "-")
    for t in texts
]


from sklearn.model_selection import train_test_split

# Hold out 30 % of the data, then cut that hold-out in half: validation / test.
X_train, X_temp, y_train, y_temp = train_test_split(
    prep_texts, labels, test_size=0.30, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")


def tokenize(batch):
    """Tokenize ``batch["text"]``, padding/truncating each entry to 64 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


def to_hf_dataset(texts, labels):
    """Return the ``{"text": ..., "label": ...}`` column mapping for a split."""
    return {"text": texts, "label": labels}


# Build each split from its column mapping, tokenize it in batches and expose
# the result as torch tensors.
train_ds = Dataset.from_dict(to_hf_dataset(X_train, y_train))
train_ds = train_ds.map(tokenize, batched=True).with_format("torch")

val_ds = Dataset.from_dict(to_hf_dataset(X_val, y_val))
val_ds = val_ds.map(tokenize, batched=True).with_format("torch")


# ---------------------------
# 3.
# MODEL: T5 encoder + classification head
# ---------------------------
class T5BinaryClassifier(nn.Module):
    """T5 encoder followed by dropout and a two-way linear classifier.

    The encoder's last hidden states are mean-pooled over the sequence axis
    and the pooled vector is projected to two logits.

    Args:
        model_name: Identifier or path passed to
            ``T5EncoderModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled vector.
        mask_padding: When true (default) the mean is taken only over the
            positions where ``attention_mask`` is 1, so padding added by the
            tokenizer does not dilute the pooled vector. Pass ``False`` to get
            the plain ``hidden.mean(dim=1)`` over every position.
            NOTE(review): weights trained with the earlier, unmasked pooling
            presumably need ``mask_padding=False`` at inference time - confirm
            before reusing an existing checkpoint.
    """

    def __init__(self, model_name, dropout=0.1, mask_padding=True):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        enc_dim = self.encoder.config.d_model  # 1024 (prot_t5_xl)
        self.dropout = nn.Dropout(dropout)
        self.cls = nn.Linear(enc_dim, 2)  # binary
        self.mask_padding = mask_padding

    def forward(self,
                input_ids=None,
                attention_mask=None,
                labels=None,
                **kwargs):
        """Encode, pool and classify a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder and, when
                ``mask_padding`` is set, used as the pooling weights.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.
            **kwargs: Accepted and ignored.

        Returns:
            ``SequenceClassifierOutput`` carrying ``loss`` (``None`` without
            labels), ``logits`` and the encoder's ``hidden_states`` /
            ``attentions``.
        """
        enc_out = self.encoder(input_ids=input_ids,
                               attention_mask=attention_mask,
                               return_dict=True)
        # [CLS]-like vector: mean-pool instead of taking one token position.
        hidden = enc_out.last_hidden_state                 # (B, L, D)

        if self.mask_padding and attention_mask is not None:
            # Zero the padded positions and divide by the number of real
            # tokens; the clamp guards against an all-zero mask row.
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)  # (B, L, 1)
            token_count = weights.sum(dim=1).clamp(min=1.0)          # (B, 1)
            pooled = (hidden * weights).sum(dim=1) / token_count     # (B, D)
        else:
            pooled = hidden.mean(dim=1)                    # (B, D)

        logits = self.cls(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=enc_out.hidden_states,
            attentions=enc_out.attentions,
        )


model = T5BinaryClassifier("Rostlab/prot_t5_xl_uniref50").cuda()

# ---------------------------
# 4. TRAINING ARGUMENTS
# ---------------------------
args = TrainingArguments(
    output_dir="t5-bc-out",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,   # prot_t5_xl is large; 8-16 is recommended instead of 512
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,   # effective batch size 32
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_strategy="epoch",
    save_safetensors=False,
    report_to=["wandb"],
    fp16=True,
)


def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` pair."""
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

# ---------------------------
# 5.
TEST & SAVE +# --------------------------- + +# Python dict → Hugging Face Dataset +test_ds = Dataset.from_dict({"text": X_test, "label": y_test}) + +# Tokenize ve tensor formatına çevir +test_ds = test_ds.map(tokenize, batched=True).with_format("torch") + +metrics = trainer.evaluate(test_ds) +print(metrics) +# ---- Manuel kaydetme ---- +trainer.save_model( + "/arf/scratch/zisik/prott5_bc_ft" +) +tokenizer.save_pretrained("/arf/scratch/zisik/prott5_bc_ft") + + +#model.push_to_hub("isikz/prot_t5_binary_classifier") +#tokenizer.push_to_hub("isikz/prot_t5_binary_classifier") +#wandb.finish() diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a92b054ee34321d706fdba0e59cff8e1a6945bb --- /dev/null +++ b/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb84e54c51f53eb1a49e0d52446d9e470b5ea320ae7174917832ab5aef4d31a2 +size 4832674810 diff --git a/ready_to_train.csv b/ready_to_train.csv new file mode 100644 index 0000000000000000000000000000000000000000..6ccefd360c81076a54229466d245766ab20959ce --- /dev/null +++ b/ready_to_train.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498eaceee30faf2510396e17a4f8417ce65c37e576c8792a80da432313f03c0e +size 18584710 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..17ade346a1042cbe0c1436f5bedcbd85c099d582 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,125 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", 
+ "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/spiece.model b/spiece.model new file mode 100644 index 0000000000000000000000000000000000000000..6c3f9cd64e8288f59fac6b5ad7c85cbc17938ffd --- /dev/null +++ b/spiece.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74da7b4afcde53faa570114b530c726135bdfcdb813dec3abfb27f9d44db7324 +size 237990 diff --git a/t5-bc-out/checkpoint-47916/optimizer.pt b/t5-bc-out/checkpoint-47916/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b5af208c2eab52b5ee380279553fdfaae416815 --- /dev/null +++ b/t5-bc-out/checkpoint-47916/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3923cb1c3204d99805be4282d57866443cbdd1f5f71ad6af1c81ee4a783d7e9d +size 9665321730 diff --git a/t5-bc-out/checkpoint-47916/pytorch_model.bin b/t5-bc-out/checkpoint-47916/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e2f3cad7a1e11f71e29927be70fe6e98f2aa55e --- /dev/null +++ b/t5-bc-out/checkpoint-47916/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80957033108061961f1d326abe9e2829f4d78524a478d52ecec37db106fbe5cc +size 4832674810 diff --git a/t5-bc-out/checkpoint-47916/rng_state.pth b/t5-bc-out/checkpoint-47916/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8b71d0fca00d67f9c6d59efc1fffbd5a7d79baa --- /dev/null +++ 
b/t5-bc-out/checkpoint-47916/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1af688f89b64a7c9246d9d5848b03b2543dd68c97861fab57333014cd508ec2 +size 14244 diff --git a/t5-bc-out/checkpoint-47916/scheduler.pt b/t5-bc-out/checkpoint-47916/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..23be9ed08525013187375982b1d0445c0d3cc932 --- /dev/null +++ b/t5-bc-out/checkpoint-47916/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62074fe1abf3e8558aec193d31cdd76f6c2650659b0c8d62d4b5ff6d20fd6edd +size 1064 diff --git a/t5-bc-out/checkpoint-47916/trainer_state.json b/t5-bc-out/checkpoint-47916/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2137097bc4315cb69634f262ff91a78a5ab147ce --- /dev/null +++ b/t5-bc-out/checkpoint-47916/trainer_state.json @@ -0,0 +1,725 @@ +{ + "best_metric": 0.1829579919576645, + "best_model_checkpoint": "t5-bc-out/checkpoint-31944", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 47916, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03130478337089907, + "grad_norm": 1.3348039388656616, + "learning_rate": 4.947825361048502e-05, + "loss": 0.5856, + "step": 500 + }, + { + "epoch": 0.06260956674179814, + "grad_norm": 2.473144292831421, + "learning_rate": 4.8956507220970036e-05, + "loss": 0.5183, + "step": 1000 + }, + { + "epoch": 0.09391435011269722, + "grad_norm": 3.6210598945617676, + "learning_rate": 4.843476083145505e-05, + "loss": 0.4879, + "step": 1500 + }, + { + "epoch": 0.12521913348359628, + "grad_norm": 6.336288928985596, + "learning_rate": 4.791405793471909e-05, + "loss": 0.4579, + "step": 2000 + }, + { + "epoch": 0.15652391685449538, + "grad_norm": 2.6699299812316895, + "learning_rate": 4.739231154520411e-05, + "loss": 0.4421, + "step": 2500 + }, + { + "epoch": 0.18782870022539444, + "grad_norm": 
7.918868064880371, + "learning_rate": 4.6870565155689124e-05, + "loss": 0.4205, + "step": 3000 + }, + { + "epoch": 0.2191334835962935, + "grad_norm": 2.9816083908081055, + "learning_rate": 4.634881876617414e-05, + "loss": 0.4044, + "step": 3500 + }, + { + "epoch": 0.25043826696719257, + "grad_norm": 7.581803321838379, + "learning_rate": 4.582707237665916e-05, + "loss": 0.3901, + "step": 4000 + }, + { + "epoch": 0.28174305033809166, + "grad_norm": 6.031352996826172, + "learning_rate": 4.5305325987144174e-05, + "loss": 0.3834, + "step": 4500 + }, + { + "epoch": 0.31304783370899075, + "grad_norm": 2.581623077392578, + "learning_rate": 4.478357959762919e-05, + "loss": 0.3601, + "step": 5000 + }, + { + "epoch": 0.3443526170798898, + "grad_norm": 4.7024245262146, + "learning_rate": 4.42618332081142e-05, + "loss": 0.3492, + "step": 5500 + }, + { + "epoch": 0.3756574004507889, + "grad_norm": 8.929915428161621, + "learning_rate": 4.374217380415728e-05, + "loss": 0.3435, + "step": 6000 + }, + { + "epoch": 0.406962183821688, + "grad_norm": 3.694370985031128, + "learning_rate": 4.32204274146423e-05, + "loss": 0.3366, + "step": 6500 + }, + { + "epoch": 0.438266967192587, + "grad_norm": 5.6961350440979, + "learning_rate": 4.2698681025127307e-05, + "loss": 0.3259, + "step": 7000 + }, + { + "epoch": 0.4695717505634861, + "grad_norm": 2.740339756011963, + "learning_rate": 4.217693463561232e-05, + "loss": 0.3224, + "step": 7500 + }, + { + "epoch": 0.5008765339343851, + "grad_norm": 3.7285494804382324, + "learning_rate": 4.165518824609734e-05, + "loss": 0.3103, + "step": 8000 + }, + { + "epoch": 0.5321813173052843, + "grad_norm": 5.1480326652526855, + "learning_rate": 4.1133441856582356e-05, + "loss": 0.3107, + "step": 8500 + }, + { + "epoch": 0.5634861006761833, + "grad_norm": 4.8817620277404785, + "learning_rate": 4.0611695467067366e-05, + "loss": 0.2945, + "step": 9000 + }, + { + "epoch": 0.5947908840470824, + "grad_norm": 5.003459453582764, + "learning_rate": 
4.008994907755238e-05, + "loss": 0.2903, + "step": 9500 + }, + { + "epoch": 0.6260956674179815, + "grad_norm": 6.451533317565918, + "learning_rate": 3.95682026880374e-05, + "loss": 0.284, + "step": 10000 + }, + { + "epoch": 0.6574004507888805, + "grad_norm": 7.442136287689209, + "learning_rate": 3.9046456298522416e-05, + "loss": 0.276, + "step": 10500 + }, + { + "epoch": 0.6887052341597796, + "grad_norm": 3.617513656616211, + "learning_rate": 3.852575340178646e-05, + "loss": 0.27, + "step": 11000 + }, + { + "epoch": 0.7200100175306787, + "grad_norm": 5.776317596435547, + "learning_rate": 3.800400701227148e-05, + "loss": 0.2666, + "step": 11500 + }, + { + "epoch": 0.7513148009015778, + "grad_norm": 6.264099597930908, + "learning_rate": 3.7482260622756494e-05, + "loss": 0.257, + "step": 12000 + }, + { + "epoch": 0.7826195842724768, + "grad_norm": 4.222651481628418, + "learning_rate": 3.6960514233241504e-05, + "loss": 0.2566, + "step": 12500 + }, + { + "epoch": 0.813924367643376, + "grad_norm": 6.953704833984375, + "learning_rate": 3.643876784372652e-05, + "loss": 0.2502, + "step": 13000 + }, + { + "epoch": 0.845229151014275, + "grad_norm": 3.2264351844787598, + "learning_rate": 3.591806494699057e-05, + "loss": 0.2364, + "step": 13500 + }, + { + "epoch": 0.876533934385174, + "grad_norm": 6.233669281005859, + "learning_rate": 3.539631855747558e-05, + "loss": 0.2451, + "step": 14000 + }, + { + "epoch": 0.9078387177560732, + "grad_norm": 8.540342330932617, + "learning_rate": 3.48745721679606e-05, + "loss": 0.2364, + "step": 14500 + }, + { + "epoch": 0.9391435011269722, + "grad_norm": 4.3881516456604, + "learning_rate": 3.4352825778445616e-05, + "loss": 0.2312, + "step": 15000 + }, + { + "epoch": 0.9704482844978712, + "grad_norm": 6.7153167724609375, + "learning_rate": 3.383107938893063e-05, + "loss": 0.2323, + "step": 15500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9204725991125071, + "eval_loss": 0.2026778757572174, + "eval_runtime": 180.0542, + 
"eval_samples_per_second": 608.272, + "eval_steps_per_second": 76.038, + "step": 15972 + }, + { + "epoch": 1.0017530678687703, + "grad_norm": 4.329936504364014, + "learning_rate": 3.331037649219468e-05, + "loss": 0.2163, + "step": 16000 + }, + { + "epoch": 1.0330578512396693, + "grad_norm": 8.806492805480957, + "learning_rate": 3.278863010267969e-05, + "loss": 0.139, + "step": 16500 + }, + { + "epoch": 1.0643626346105686, + "grad_norm": 9.733407020568848, + "learning_rate": 3.226688371316471e-05, + "loss": 0.1419, + "step": 17000 + }, + { + "epoch": 1.0956674179814676, + "grad_norm": 3.5503616333007812, + "learning_rate": 3.174513732364972e-05, + "loss": 0.1361, + "step": 17500 + }, + { + "epoch": 1.1269722013523666, + "grad_norm": 5.853847503662109, + "learning_rate": 3.122339093413474e-05, + "loss": 0.1398, + "step": 18000 + }, + { + "epoch": 1.1582769847232657, + "grad_norm": 1.6936904191970825, + "learning_rate": 3.0701644544619754e-05, + "loss": 0.1373, + "step": 18500 + }, + { + "epoch": 1.1895817680941647, + "grad_norm": 1.5299335718154907, + "learning_rate": 3.017989815510477e-05, + "loss": 0.1423, + "step": 19000 + }, + { + "epoch": 1.220886551465064, + "grad_norm": 3.899322986602783, + "learning_rate": 2.965815176558978e-05, + "loss": 0.1391, + "step": 19500 + }, + { + "epoch": 1.252191334835963, + "grad_norm": 2.3118438720703125, + "learning_rate": 2.913744886885383e-05, + "loss": 0.1408, + "step": 20000 + }, + { + "epoch": 1.283496118206862, + "grad_norm": 0.6930440068244934, + "learning_rate": 2.8615702479338845e-05, + "loss": 0.1408, + "step": 20500 + }, + { + "epoch": 1.314800901577761, + "grad_norm": 2.851909875869751, + "learning_rate": 2.8093956089823858e-05, + "loss": 0.1404, + "step": 21000 + }, + { + "epoch": 1.3461056849486601, + "grad_norm": 0.22848767042160034, + "learning_rate": 2.7572209700308875e-05, + "loss": 0.1382, + "step": 21500 + }, + { + "epoch": 1.3774104683195592, + "grad_norm": 3.973886489868164, + "learning_rate": 
2.7050463310793888e-05, + "loss": 0.1396, + "step": 22000 + }, + { + "epoch": 1.4087152516904582, + "grad_norm": 3.140080451965332, + "learning_rate": 2.6529760414057936e-05, + "loss": 0.127, + "step": 22500 + }, + { + "epoch": 1.4400200350613575, + "grad_norm": 5.468123435974121, + "learning_rate": 2.6008014024542953e-05, + "loss": 0.1276, + "step": 23000 + }, + { + "epoch": 1.4713248184322565, + "grad_norm": 0.626640260219574, + "learning_rate": 2.5486267635027966e-05, + "loss": 0.1219, + "step": 23500 + }, + { + "epoch": 1.5026296018031555, + "grad_norm": 3.1899547576904297, + "learning_rate": 2.496452124551298e-05, + "loss": 0.1319, + "step": 24000 + }, + { + "epoch": 1.5339343851740546, + "grad_norm": 3.199150562286377, + "learning_rate": 2.4442774855997996e-05, + "loss": 0.1298, + "step": 24500 + }, + { + "epoch": 1.5652391685449536, + "grad_norm": 5.129565715789795, + "learning_rate": 2.3921028466483013e-05, + "loss": 0.1217, + "step": 25000 + }, + { + "epoch": 1.5965439519158529, + "grad_norm": 4.223311424255371, + "learning_rate": 2.339928207696803e-05, + "loss": 0.1288, + "step": 25500 + }, + { + "epoch": 1.6278487352867517, + "grad_norm": 10.741965293884277, + "learning_rate": 2.2877535687453046e-05, + "loss": 0.1263, + "step": 26000 + }, + { + "epoch": 1.659153518657651, + "grad_norm": 3.0217132568359375, + "learning_rate": 2.235578929793806e-05, + "loss": 0.122, + "step": 26500 + }, + { + "epoch": 1.69045830202855, + "grad_norm": 7.847172737121582, + "learning_rate": 2.1835086401202104e-05, + "loss": 0.122, + "step": 27000 + }, + { + "epoch": 1.721763085399449, + "grad_norm": 9.223713874816895, + "learning_rate": 2.1313340011687117e-05, + "loss": 0.1266, + "step": 27500 + }, + { + "epoch": 1.7530678687703483, + "grad_norm": 2.0706963539123535, + "learning_rate": 2.0791593622172137e-05, + "loss": 0.1274, + "step": 28000 + }, + { + "epoch": 1.784372652141247, + "grad_norm": 3.1475393772125244, + "learning_rate": 2.0270890725436182e-05, + "loss": 0.1214, 
+ "step": 28500 + }, + { + "epoch": 1.8156774355121463, + "grad_norm": 3.7348415851593018, + "learning_rate": 1.9749144335921196e-05, + "loss": 0.1191, + "step": 29000 + }, + { + "epoch": 1.8469822188830454, + "grad_norm": 3.230713129043579, + "learning_rate": 1.9227397946406212e-05, + "loss": 0.1199, + "step": 29500 + }, + { + "epoch": 1.8782870022539444, + "grad_norm": 0.4691683351993561, + "learning_rate": 1.8705651556891226e-05, + "loss": 0.1176, + "step": 30000 + }, + { + "epoch": 1.9095917856248434, + "grad_norm": 4.382262706756592, + "learning_rate": 1.8183905167376242e-05, + "loss": 0.1176, + "step": 30500 + }, + { + "epoch": 1.9408965689957425, + "grad_norm": 9.810182571411133, + "learning_rate": 1.7662158777861255e-05, + "loss": 0.1083, + "step": 31000 + }, + { + "epoch": 1.9722013523666417, + "grad_norm": 8.107538223266602, + "learning_rate": 1.7140412388346275e-05, + "loss": 0.1103, + "step": 31500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9478369642628878, + "eval_loss": 0.1829579919576645, + "eval_runtime": 179.9731, + "eval_samples_per_second": 608.547, + "eval_steps_per_second": 76.072, + "step": 31944 + }, + { + "epoch": 2.0035061357375405, + "grad_norm": 0.5452843308448792, + "learning_rate": 1.661866599883129e-05, + "loss": 0.1087, + "step": 32000 + }, + { + "epoch": 2.03481091910844, + "grad_norm": 1.0569943189620972, + "learning_rate": 1.6097963102095334e-05, + "loss": 0.0456, + "step": 32500 + }, + { + "epoch": 2.0661157024793386, + "grad_norm": 0.22022764384746552, + "learning_rate": 1.557621671258035e-05, + "loss": 0.0523, + "step": 33000 + }, + { + "epoch": 2.097420485850238, + "grad_norm": 9.75222396850586, + "learning_rate": 1.5054470323065365e-05, + "loss": 0.0492, + "step": 33500 + }, + { + "epoch": 2.128725269221137, + "grad_norm": 3.1281306743621826, + "learning_rate": 1.453272393355038e-05, + "loss": 0.0498, + "step": 34000 + }, + { + "epoch": 2.160030052592036, + "grad_norm": 0.012396792881190777, + "learning_rate": 
1.4012021036814427e-05, + "loss": 0.0506, + "step": 34500 + }, + { + "epoch": 2.191334835962935, + "grad_norm": 6.527154922485352, + "learning_rate": 1.3490274647299442e-05, + "loss": 0.0569, + "step": 35000 + }, + { + "epoch": 2.222639619333834, + "grad_norm": 3.5429670810699463, + "learning_rate": 1.2968528257784457e-05, + "loss": 0.0548, + "step": 35500 + }, + { + "epoch": 2.2539444027047333, + "grad_norm": 1.333369255065918, + "learning_rate": 1.2446781868269472e-05, + "loss": 0.0558, + "step": 36000 + }, + { + "epoch": 2.2852491860756325, + "grad_norm": 0.10260029882192612, + "learning_rate": 1.1926078971533518e-05, + "loss": 0.0464, + "step": 36500 + }, + { + "epoch": 2.3165539694465314, + "grad_norm": 0.14060164988040924, + "learning_rate": 1.1404332582018533e-05, + "loss": 0.0515, + "step": 37000 + }, + { + "epoch": 2.3478587528174306, + "grad_norm": 1.031032919883728, + "learning_rate": 1.0882586192503548e-05, + "loss": 0.0448, + "step": 37500 + }, + { + "epoch": 2.3791635361883294, + "grad_norm": 0.20121368765830994, + "learning_rate": 1.0360839802988565e-05, + "loss": 0.0475, + "step": 38000 + }, + { + "epoch": 2.4104683195592287, + "grad_norm": 0.06531311571598053, + "learning_rate": 9.84013690625261e-06, + "loss": 0.0522, + "step": 38500 + }, + { + "epoch": 2.441773102930128, + "grad_norm": 0.04498385637998581, + "learning_rate": 9.318390516737625e-06, + "loss": 0.0434, + "step": 39000 + }, + { + "epoch": 2.4730778863010268, + "grad_norm": 0.3482716679573059, + "learning_rate": 8.796644127222641e-06, + "loss": 0.0468, + "step": 39500 + }, + { + "epoch": 2.504382669671926, + "grad_norm": 4.0475053787231445, + "learning_rate": 8.274897737707656e-06, + "loss": 0.0505, + "step": 40000 + }, + { + "epoch": 2.535687453042825, + "grad_norm": 0.6960127353668213, + "learning_rate": 7.753151348192671e-06, + "loss": 0.0421, + "step": 40500 + }, + { + "epoch": 2.566992236413724, + "grad_norm": 0.8902493119239807, + "learning_rate": 7.231404958677686e-06, + "loss": 
0.0451, + "step": 41000 + }, + { + "epoch": 2.5982970197846234, + "grad_norm": 0.46462351083755493, + "learning_rate": 6.710702061941732e-06, + "loss": 0.0522, + "step": 41500 + }, + { + "epoch": 2.629601803155522, + "grad_norm": 0.07463126629590988, + "learning_rate": 6.1889556724267476e-06, + "loss": 0.0468, + "step": 42000 + }, + { + "epoch": 2.660906586526421, + "grad_norm": 0.05138092488050461, + "learning_rate": 5.6672092829117625e-06, + "loss": 0.0429, + "step": 42500 + }, + { + "epoch": 2.6922113698973202, + "grad_norm": 0.06017659977078438, + "learning_rate": 5.145462893396778e-06, + "loss": 0.038, + "step": 43000 + }, + { + "epoch": 2.7235161532682195, + "grad_norm": 3.794154405593872, + "learning_rate": 4.624759996660823e-06, + "loss": 0.0418, + "step": 43500 + }, + { + "epoch": 2.7548209366391183, + "grad_norm": 9.929149627685547, + "learning_rate": 4.103013607145838e-06, + "loss": 0.0418, + "step": 44000 + }, + { + "epoch": 2.7861257200100176, + "grad_norm": 0.10156802833080292, + "learning_rate": 3.5812672176308544e-06, + "loss": 0.0435, + "step": 44500 + }, + { + "epoch": 2.8174305033809164, + "grad_norm": 15.590471267700195, + "learning_rate": 3.0595208281158697e-06, + "loss": 0.039, + "step": 45000 + }, + { + "epoch": 2.8487352867518156, + "grad_norm": 0.1026441678404808, + "learning_rate": 2.5377744386008846e-06, + "loss": 0.0451, + "step": 45500 + }, + { + "epoch": 2.880040070122715, + "grad_norm": 0.08782440423965454, + "learning_rate": 2.0160280490859004e-06, + "loss": 0.0408, + "step": 46000 + }, + { + "epoch": 2.9113448534936137, + "grad_norm": 17.5203857421875, + "learning_rate": 1.494281659570916e-06, + "loss": 0.0372, + "step": 46500 + }, + { + "epoch": 2.942649636864513, + "grad_norm": 0.08832889050245285, + "learning_rate": 9.735787628349612e-07, + "loss": 0.041, + "step": 47000 + }, + { + "epoch": 2.973954420235412, + "grad_norm": 10.057083129882812, + "learning_rate": 4.518323733199766e-07, + "loss": 0.0417, + "step": 47500 + }, + { + 
"epoch": 3.0, + "eval_accuracy": 0.9541735906941071, + "eval_loss": 0.2335142344236374, + "eval_runtime": 176.4196, + "eval_samples_per_second": 620.804, + "eval_steps_per_second": 77.605, + "step": 47916 + } + ], + "logging_steps": 500, + "max_steps": 47916, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/t5-bc-out/checkpoint-47916/training_args.bin b/t5-bc-out/checkpoint-47916/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c3466fd00b79452242858a747e6bfe168409a38f --- /dev/null +++ b/t5-bc-out/checkpoint-47916/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705750eb5050da7b859b299363db4324be92a3af2ba4a8530c69e964f52524d7 +size 5176 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2b787858aa387c7d376edeef2f479d543ca4012 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,941 @@ +{ + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "28": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "29": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "30": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "31": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "33": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "34": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "35": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "36": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "37": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "38": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "39": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "40": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "41": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "42": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "43": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "44": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "45": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "46": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "47": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "48": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "49": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "50": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "51": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "52": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "53": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "54": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "55": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "56": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "57": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "58": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "59": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "60": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "61": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "62": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "63": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "64": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "65": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "66": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "67": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "68": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "69": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "70": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "71": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "72": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "73": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "74": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "75": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "76": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "77": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "78": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "79": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "80": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "81": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "82": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "83": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "84": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "85": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "86": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "87": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "88": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "89": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "90": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "91": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "92": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "93": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "94": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "95": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "96": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "97": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "98": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "99": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "100": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "101": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "102": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "103": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "104": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "105": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "106": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "108": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "109": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "110": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "111": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "112": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "113": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "114": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "115": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "116": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "117": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "118": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "119": { + "content": "", + "lstrip": true, + "normalized": false, + 
"rstrip": true, + "single_word": false, + "special": true + }, + "120": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "121": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "122": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "123": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "124": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "125": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "126": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "127": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "", + "extra_ids": 100, + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, 
+ "tokenizer_class": "T5Tokenizer", + "unk_token": "" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c3466fd00b79452242858a747e6bfe168409a38f --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705750eb5050da7b859b299363db4324be92a3af2ba4a8530c69e964f52524d7 +size 5176 diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8f66fe3c7ae770c0e93c28ce15a95a46c40e21af --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,21 @@ +{"time":"2025-05-04T17:25:03.375857654+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T17:25:03.375905253+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log"} +{"time":"2025-05-04T17:25:03.501241143+03:00","level":"INFO","msg":"created new stream","id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501294637+03:00","level":"INFO","msg":"stream: started","id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501448652+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501451145+03:00","level":"INFO","msg":"handler: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501574427+03:00","level":"INFO","msg":"sender: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.865922055+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T22:47:43.191425732+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2025-05-05T00:01:47.351449692+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": 
dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2025-05-05T00:49:32.57779148+03:00","level":"INFO","msg":"stream: closing","id":"0ictlmwf"} +{"time":"2025-05-05T00:49:32.577842715+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-05T00:49:32.578849729+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-05T00:49:32.781968337+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-05T00:49:32.781997123+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-05T00:49:32.782008311+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-05T00:49:33.357099059+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-05T00:49:33.741524339+03:00","level":"INFO","msg":"handler: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741583153+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741593811+03:00","level":"INFO","msg":"sender: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741652369+03:00","level":"INFO","msg":"stream: closed","id":"0ictlmwf"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..627abd37727afa0dddc772a5f08d1d451156833a --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,27 @@ +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Configure stats pid to 3189710 +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 
17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug.log +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():619] calling init triggers +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():669] starting backend +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():673] sending inform_init request +2025-05-04 17:25:03,371 INFO MainThread:3189710 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 17:25:03,371 INFO MainThread:3189710 [wandb_init.py:init():686] backend started and connected +2025-05-04 17:25:03,379 INFO MainThread:3189710 [wandb_init.py:init():781] updated telemetry +2025-05-04 17:25:03,382 INFO MainThread:3189710 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 17:25:03,852 INFO MainThread:3189710 
[wandb_init.py:init():867] starting run threads in backend +2025-05-04 17:25:05,277 INFO MainThread:3189710 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2395] Redirects installed. +2025-05-04 17:25:05,283 INFO MainThread:3189710 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 17:25:53,069 INFO MainThread:3189710 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_17-25-43_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': 
True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 
'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-05 00:49:32,578 WARNING MsgRouterThr:3189710 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132610-pxg645u5/files/config.yaml b/wandb/run-20250504_132610-pxg645u5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e7549dbe318b236ac4d168d1610ec259f3f67e0 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/files/config.yaml @@ -0,0 +1,44 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 23 + - 55 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250504_132610-pxg645u5/files/output.log b/wandb/run-20250504_132610-pxg645u5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f32d8969878a7a0628870456700492bec8448c62 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/files/output.log @@ -0,0 +1,37 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 45, in + train_ds = load_dataset("json", data_files={"train": "-"}, + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 2132, in load_dataset + builder_instance = load_dataset_builder( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1853, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1562, in dataset_module_factory + ).get_module() + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 942, in get_module + data_files = DataFilesDict.from_patterns( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 721, in from_patterns + else DataFilesList.from_patterns( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 624, in from_patterns + resolve_pattern( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 411, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/arf/scratch/zisik/prott5_bc_ft/-' +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 45, in + train_ds = load_dataset("json", data_files={"train": "-"}, + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 2132, in 
load_dataset + builder_instance = load_dataset_builder( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1853, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 1562, in dataset_module_factory + ).get_module() + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/load.py", line 942, in get_module + data_files = DataFilesDict.from_patterns( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 721, in from_patterns + else DataFilesList.from_patterns( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 624, in from_patterns + resolve_pattern( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/datasets/data_files.py", line 411, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/arf/scratch/zisik/prott5_bc_ft/-' diff --git a/wandb/run-20250504_132610-pxg645u5/files/requirements.txt b/wandb/run-20250504_132610-pxg645u5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 
+nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 
+langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 
+kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 
+simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 
+propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 
+jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json b/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..448328b179970362f2471973f31fb58da4f76b55 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T10:26:10.053836Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274767593472" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746613538", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027932", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": 
"kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746354338", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027932", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3156950", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json b/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..abe7f35e04106235b4471ed10391e2de502bf8a5 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":6}} \ No newline at end of file diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log b/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..d8927ec645e582bb16b497af54aed2f51506dd14 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T13:26:09.392354119+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmppack6571/port-3156976.txt","pid":3156976,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T13:26:09.392402628+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T13:26:09.393200765+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":36685,"Zone":""}} +{"time":"2025-05-04T13:26:09.393299078+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3156976} +{"time":"2025-05-04T13:26:09.570123715+03:00","level":"INFO","msg":"connection: 
ManageConnectionData: new connection created","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:10.055349971+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"pxg645u5","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:10.180212249+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"pxg645u5","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:16.993053475+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:16.994546738+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T13:26:16.993862146+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:16.994899765+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:17.953982632+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:17.954000039+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:37852"} +{"time":"2025-05-04T13:26:17.954015604+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log b/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..90be0a8f62ab298af46980179616b6b5c91f3e29 --- /dev/null +++ b/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T13:26:10.056874799+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T13:26:10.056920353+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug-core.log"} +{"time":"2025-05-04T13:26:10.180146537+03:00","level":"INFO","msg":"created new stream","id":"pxg645u5"} 
+{"time":"2025-05-04T13:26:10.180200098+03:00","level":"INFO","msg":"stream: started","id":"pxg645u5"} +{"time":"2025-05-04T13:26:10.180372555+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:10.180478207+03:00","level":"INFO","msg":"sender: started","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:10.18057531+03:00","level":"INFO","msg":"handler: started","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:10.587540794+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T13:26:16.993666261+03:00","level":"INFO","msg":"stream: closing","id":"pxg645u5"} +{"time":"2025-05-04T13:26:16.993748173+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T13:26:16.995793958+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T13:26:17.198876326+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T13:26:17.198909473+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T13:26:17.198920913+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T13:26:17.694743818+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T13:26:17.953755664+03:00","level":"INFO","msg":"handler: closed","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:17.953802728+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:17.953828101+03:00","level":"INFO","msg":"sender: closed","stream_id":"pxg645u5"} +{"time":"2025-05-04T13:26:17.953904675+03:00","level":"INFO","msg":"stream: closed","id":"pxg645u5"} diff --git a/wandb/run-20250504_132610-pxg645u5/logs/debug.log b/wandb/run-20250504_132610-pxg645u5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..468c0395d71efd915d75073afc6774b985f26212 --- /dev/null +++ 
b/wandb/run-20250504_132610-pxg645u5/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Configure stats pid to 3156976 +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 13:26:10,046 INFO MainThread:3156976 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug.log +2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132610-pxg645u5/logs/debug-internal.log +2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():619] calling init triggers +2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 13:26:10,047 INFO MainThread:3156976 [wandb_init.py:init():669] starting backend +2025-05-04 13:26:10,047 INFO MainThread:3156976 
[wandb_init.py:init():673] sending inform_init request +2025-05-04 13:26:10,052 INFO MainThread:3156976 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 13:26:10,053 INFO MainThread:3156976 [wandb_init.py:init():686] backend started and connected +2025-05-04 13:26:10,061 INFO MainThread:3156976 [wandb_init.py:init():781] updated telemetry +2025-05-04 13:26:10,064 INFO MainThread:3156976 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 13:26:10,574 INFO MainThread:3156976 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 13:26:12,208 INFO MainThread:3156976 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 13:26:12,209 INFO MainThread:3156976 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 13:26:12,220 INFO MainThread:3156976 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 13:26:16,995 WARNING MsgRouterThr:3156976 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb b/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..ebcf26b6563d253be1738d7c6c5bd6f413bdaf9a Binary files /dev/null and b/wandb/run-20250504_132610-pxg645u5/run-pxg645u5.wandb differ diff --git a/wandb/run-20250504_132912-1agsw1y8/files/config.yaml b/wandb/run-20250504_132912-1agsw1y8/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89a4f38c983e370e131179dcc4d572a4d25e65b6 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/files/config.yaml @@ -0,0 +1,374 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: 
true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false 
+include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_13-33-08_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: 
+ value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_132912-1agsw1y8/files/output.log b/wandb/run-20250504_132912-1agsw1y8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8ca93eec2346930dfe72e70314a1388aa43e22d8 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/files/output.log @@ -0,0 +1,87 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 511104/511104 [00:20<00:00, 25525.81 examples/s] +Map: 100%|██████████| 109522/109522 [00:04<00:00, 26956.64 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 13:33:14,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+ 33%|███▎ | 15972/47916 [2:22:01<4:54:49, 1.81it/s] +{'loss': 0.6947, 'grad_norm': 0.09912440180778503, 'learning_rate': 4.947825361048502e-05, 'epoch': 0.03} +{'loss': 0.6939, 'grad_norm': 0.23786939680576324, 'learning_rate': 4.8956507220970036e-05, 'epoch': 0.06} +{'loss': 0.6936, 'grad_norm': 0.10555226355791092, 'learning_rate': 4.843476083145505e-05, 'epoch': 0.09} +{'loss': 0.6935, 'grad_norm': 0.28058305382728577, 'learning_rate': 4.791301444194006e-05, 'epoch': 0.13} +{'loss': 0.6937, 'grad_norm': 0.13599741458892822, 'learning_rate': 4.739126805242508e-05, 'epoch': 0.16} +{'loss': 0.6935, 'grad_norm': 0.13076388835906982, 'learning_rate': 4.6869521662910095e-05, 'epoch': 0.19} +{'loss': 0.6934, 'grad_norm': 0.1778457760810852, 'learning_rate': 4.634777527339511e-05, 'epoch': 0.22} +{'loss': 0.6935, 'grad_norm': 0.4112167954444885, 'learning_rate': 4.582602888388012e-05, 'epoch': 0.25} +{'loss': 0.6934, 'grad_norm': 0.1330016702413559, 'learning_rate': 4.530428249436514e-05, 'epoch': 0.28} +{'loss': 0.6935, 'grad_norm': 0.09426847100257874, 'learning_rate': 4.478253610485016e-05, 'epoch': 0.31} +{'loss': 0.6933, 'grad_norm': 0.3686296343803406, 'learning_rate': 4.426078971533517e-05, 'epoch': 0.34} +{'loss': 0.6933, 'grad_norm': 0.21278153359889984, 'learning_rate': 4.373904332582019e-05, 'epoch': 0.38} +{'loss': 0.6935, 'grad_norm': 0.23074378073215485, 'learning_rate': 4.321834042908423e-05, 'epoch': 0.41} +{'loss': 0.6932, 'grad_norm': 0.5192509293556213, 'learning_rate': 4.269659403956925e-05, 'epoch': 0.44} +{'loss': 0.6932, 'grad_norm': 0.07643919438123703, 'learning_rate': 4.217484765005426e-05, 'epoch': 0.47} +{'loss': 0.6935, 'grad_norm': 0.09435634315013885, 'learning_rate': 4.1653101260539276e-05, 'epoch': 0.5} +{'loss': 0.6932, 'grad_norm': 0.3456329107284546, 'learning_rate': 4.113239836380333e-05, 'epoch': 0.53} +{'loss': 0.6934, 'grad_norm': 0.11689063161611557, 'learning_rate': 4.061065197428834e-05, 'epoch': 0.56} +{'loss': 0.6934, 
'grad_norm': 0.25019219517707825, 'learning_rate': 4.0088905584773355e-05, 'epoch': 0.59} +{'loss': 0.6933, 'grad_norm': 0.12248441576957703, 'learning_rate': 3.956715919525837e-05, 'epoch': 0.63} +{'loss': 0.6933, 'grad_norm': 0.11549345403909683, 'learning_rate': 3.9046456298522416e-05, 'epoch': 0.66} +{'loss': 0.6934, 'grad_norm': 0.27383607625961304, 'learning_rate': 3.852470990900743e-05, 'epoch': 0.69} +{'loss': 0.6935, 'grad_norm': 0.21311810612678528, 'learning_rate': 3.800296351949245e-05, 'epoch': 0.72} +{'loss': 0.6933, 'grad_norm': 0.25916823744773865, 'learning_rate': 3.7481217129977466e-05, 'epoch': 0.75} +{'loss': 0.6934, 'grad_norm': 0.13208124041557312, 'learning_rate': 3.6960514233241504e-05, 'epoch': 0.78} +{'loss': 0.6934, 'grad_norm': 0.4182877242565155, 'learning_rate': 3.643876784372652e-05, 'epoch': 0.81} +{'loss': 0.6933, 'grad_norm': 0.19375275075435638, 'learning_rate': 3.5917021454211544e-05, 'epoch': 0.85} +{'loss': 0.6933, 'grad_norm': 0.1647150218486786, 'learning_rate': 3.5395275064696554e-05, 'epoch': 0.88} +{'loss': 0.6933, 'grad_norm': 0.458692729473114, 'learning_rate': 3.48745721679606e-05, 'epoch': 0.91} +{'loss': 0.6933, 'grad_norm': 0.24417555332183838, 'learning_rate': 3.4352825778445616e-05, 'epoch': 0.94} +{'loss': 0.6932, 'grad_norm': 0.10788150876760483, 'learning_rate': 3.383107938893063e-05, 'epoch': 0.97} + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 125, in +{'eval_loss': 0.6931192278862, 'eval_accuracy': 0.4992604225635032, 'eval_runtime': 182.4166, 'eval_samples_per_second': 600.395, 'eval_steps_per_second': 75.053, 'epoch': 1.0} + trainer.train() + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2487, in _inner_training_loop + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, 
ignore_keys_for_eval) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate + self._save_checkpoint(model, trial, metrics=metrics) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint + self.save_model(output_dir, _internal_call=True) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model + self._save(output_dir) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3721, in _save + safetensors.torch.save_file( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 286, in save_file + serialize_file(_flatten(tensors), filename, metadata=metadata) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 488, in _flatten + raise RuntimeError( +RuntimeError: + Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'encoder.encoder.embed_tokens.weight', 'encoder.shared.weight'}]. + A potential way to correctly save your model is to use `save_model`. 
+ More information at https://huggingface.co/docs/safetensors/torch_shared_tensors + +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 125, in + trainer.train() + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2487, in _inner_training_loop + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate + self._save_checkpoint(model, trial, metrics=metrics) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint + self.save_model(output_dir, _internal_call=True) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model + self._save(output_dir) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3721, in _save + safetensors.torch.save_file( + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 286, in save_file + serialize_file(_flatten(tensors), filename, metadata=metadata) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/safetensors/torch.py", line 488, in _flatten + raise RuntimeError( +RuntimeError: + Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'encoder.encoder.embed_tokens.weight', 'encoder.shared.weight'}]. + A potential way to correctly save your model is to use `save_model`. 
+ More information at https://huggingface.co/docs/safetensors/torch_shared_tensors + diff --git a/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt b/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 
+mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 
+tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 
+distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 
+googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 
+psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json b/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e27daebbacd3a1b6062dd305d598e9d1014c3f16 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T10:29:13.019628Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + 
"cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274768302080" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746613727", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027934", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746354527", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027934", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3157550", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json b/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..030bbea79bd4d5fc3ae46d01de3f64e2d7ead2c3 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"train/learning_rate":3.383107938893063e-05,"train/global_step":15972,"eval/steps_per_second":75.053,"_timestamp":1.7463635035359182e+09,"eval/accuracy":0.4992604225635032,"_step":31,"eval/loss":0.6931192278862,"train/grad_norm":0.10788150876760483,"train/epoch":1,"_wandb":{"runtime":8950},"_runtime":8950.516897928,"train/loss":0.6932,"eval/runtime":182.4166,"eval/samples_per_second":600.395} \ No newline at end of file diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..dad0e4abf15eab93aed95168c60fe6412f76a17e --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T13:29:12.35887463+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp1u83hfoi/port-3157577.txt","pid":3157577,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T13:29:12.358923345+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T13:29:12.35977753+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45947,"Zone":""}} +{"time":"2025-05-04T13:29:12.359879073+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3157577} +{"time":"2025-05-04T13:29:12.546636547+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34718"} +{"time":"2025-05-04T13:29:13.02161239+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"1agsw1y8","id":"127.0.0.1:34718"} +{"time":"2025-05-04T13:29:13.145638422+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"1agsw1y8","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:23.607250248+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:23.607435128+03:00","level":"INFO","msg":"server is 
shutting down"} +{"time":"2025-05-04T15:58:23.607401252+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:23.607720003+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:24.801882716+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:24.801915389+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:34718"} +{"time":"2025-05-04T15:58:24.801937893+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3e1b9c9e1960c66d21bac86084b75cecf9a700d0 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T13:29:13.023253759+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T13:29:13.023302807+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug-core.log"} +{"time":"2025-05-04T13:29:13.145570529+03:00","level":"INFO","msg":"created new stream","id":"1agsw1y8"} +{"time":"2025-05-04T13:29:13.145625833+03:00","level":"INFO","msg":"stream: started","id":"1agsw1y8"} +{"time":"2025-05-04T13:29:13.145806528+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"1agsw1y8"} +{"time":"2025-05-04T13:29:13.145923955+03:00","level":"INFO","msg":"handler: started","stream_id":"1agsw1y8"} +{"time":"2025-05-04T13:29:13.146011145+03:00","level":"INFO","msg":"sender: started","stream_id":"1agsw1y8"} +{"time":"2025-05-04T13:29:13.51656923+03:00","level":"INFO","msg":"Starting system monitor"} 
+{"time":"2025-05-04T15:58:23.607363166+03:00","level":"INFO","msg":"stream: closing","id":"1agsw1y8"} +{"time":"2025-05-04T15:58:23.607412721+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T15:58:23.608736938+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T15:58:23.995834762+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T15:58:23.995863601+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T15:58:23.995874256+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T15:58:24.53730388+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T15:58:24.801427373+03:00","level":"INFO","msg":"handler: closed","stream_id":"1agsw1y8"} +{"time":"2025-05-04T15:58:24.801476891+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"1agsw1y8"} +{"time":"2025-05-04T15:58:24.801525233+03:00","level":"INFO","msg":"sender: closed","stream_id":"1agsw1y8"} +{"time":"2025-05-04T15:58:24.801589463+03:00","level":"INFO","msg":"stream: closed","id":"1agsw1y8"} diff --git a/wandb/run-20250504_132912-1agsw1y8/logs/debug.log b/wandb/run-20250504_132912-1agsw1y8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ea208d69901f2374562663d7c34e15b09373c8f9 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Configure stats pid to 3157577 +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from 
/arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 13:29:13,013 INFO MainThread:3157577 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug.log +2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_132912-1agsw1y8/logs/debug-internal.log +2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():619] calling init triggers +2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():669] starting backend +2025-05-04 13:29:13,014 INFO MainThread:3157577 [wandb_init.py:init():673] sending inform_init request +2025-05-04 13:29:13,018 INFO MainThread:3157577 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 13:29:13,019 INFO MainThread:3157577 [wandb_init.py:init():686] backend started and connected +2025-05-04 13:29:13,026 INFO MainThread:3157577 [wandb_init.py:init():781] updated telemetry +2025-05-04 13:29:13,030 INFO MainThread:3157577 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout 
+2025-05-04 13:29:13,503 INFO MainThread:3157577 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 13:29:14,946 INFO MainThread:3157577 [wandb_run.py:_redirect():2395] Redirects installed. +2025-05-04 13:29:14,954 INFO MainThread:3157577 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 13:33:19,417 INFO MainThread:3157577 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_13-33-08_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': 
False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 
'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 15:58:23,607 WARNING MsgRouterThr:3157577 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb b/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d0fe86b920af1550de340c5d128c7edf489a6165 --- /dev/null +++ b/wandb/run-20250504_132912-1agsw1y8/run-1agsw1y8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71cf2569d2e508f45833ce35b1904bcc5325f9369eef0a76ea074fad88d8621d +size 5615901 diff --git a/wandb/run-20250504_160615-f65jh2lv/files/output.log b/wandb/run-20250504_160615-f65jh2lv/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9f8ad836e7976228186ebb3ee636e8e5558b4888 --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/files/output.log @@ -0,0 +1,8 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 511104/511104 [00:20<00:00, 25304.42 examples/s] +Map: 100%|██████████| 109522/109522 [00:02<00:00, 36704.44 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:06:52,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + 1%| | 246/47916 [02:12<7:08:44, 1.85it/s] diff --git a/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt b/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 
+einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 
+treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 
+httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 
+decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 
+lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff 
--git a/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json b/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fd7b856b87ae9094e8c7410b93fc44a222546cc3 --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:06:15.895027Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274886729728" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746623147", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027945", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746363947", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027945", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + 
"procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3178532", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..618fc3f61177df7804f2fc4a8f211c7313be9c35 --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-05-04T16:06:15.269316376+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp6sywt0mb/port-3178556.txt","pid":3178556,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:06:15.269366219+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:06:15.2702663+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3178556} +{"time":"2025-05-04T16:06:15.270143057+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37579,"Zone":""}} +{"time":"2025-05-04T16:06:15.448913658+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:49916"} +{"time":"2025-05-04T16:06:15.898453126+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"f65jh2lv","id":"127.0.0.1:49916"} +{"time":"2025-05-04T16:06:16.021719647+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"f65jh2lv","id":"127.0.0.1:49916"} diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..261eef09aa76e080a35f7789b3265f005f6d0225 --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log @@ -0,0 
+1,8 @@ +{"time":"2025-05-04T16:06:15.899998659+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:06:15.900045512+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug-core.log"} +{"time":"2025-05-04T16:06:16.021644692+03:00","level":"INFO","msg":"created new stream","id":"f65jh2lv"} +{"time":"2025-05-04T16:06:16.021706945+03:00","level":"INFO","msg":"stream: started","id":"f65jh2lv"} +{"time":"2025-05-04T16:06:16.021839756+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"f65jh2lv"} +{"time":"2025-05-04T16:06:16.02194891+03:00","level":"INFO","msg":"handler: started","stream_id":"f65jh2lv"} +{"time":"2025-05-04T16:06:16.022034888+03:00","level":"INFO","msg":"sender: started","stream_id":"f65jh2lv"} +{"time":"2025-05-04T16:06:16.421916148+03:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20250504_160615-f65jh2lv/logs/debug.log b/wandb/run-20250504_160615-f65jh2lv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..06dd2b2a7d6174fa397a32c411642f714082fa74 --- /dev/null +++ b/wandb/run-20250504_160615-f65jh2lv/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Configure stats pid to 3178556 +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Inferring run settings from compute 
environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:06:15,888 INFO MainThread:3178556 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug.log +2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160615-f65jh2lv/logs/debug-internal.log +2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():669] starting backend +2025-05-04 16:06:15,889 INFO MainThread:3178556 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:06:15,893 INFO MainThread:3178556 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:06:15,894 INFO MainThread:3178556 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:06:15,902 INFO MainThread:3178556 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:06:15,905 INFO MainThread:3178556 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:06:16,414 INFO MainThread:3178556 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:06:17,992 INFO MainThread:3178556 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2305] 
redirect: wrap_raw +2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:06:17,993 INFO MainThread:3178556 [wandb_run.py:_redirect():2395] Redirects installed. +2025-05-04 16:06:18,004 INFO MainThread:3178556 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:06:56,772 INFO MainThread:3178556 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-06-46_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 
'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 
'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} diff --git a/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb b/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1e8a5f9b3164571a503e4306a04be53481a4529e Binary files /dev/null and b/wandb/run-20250504_160615-f65jh2lv/run-f65jh2lv.wandb differ diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml b/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e7549dbe318b236ac4d168d1610ec259f3f67e0 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/files/config.yaml @@ -0,0 +1,44 @@ +_wandb: + value: + cli_version: 0.18.7 + m: [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 23 + - 55 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "12": 0.18.7 + "13": linux-x86_64 diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/output.log b/wandb/run-20250504_160955-rqk2hbkf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..54e81f72adc802bd17a6e8b3e973b2290acd5201 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/files/output.log @@ -0,0 +1,24 @@ +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 33, in + X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper + return func(*args, **kwargs) + File 
"/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 2782, in train_test_split + arrays = indexable(*arrays) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 514, in indexable + check_consistent_length(*result) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 457, in check_consistent_length + raise ValueError( +ValueError: Found input variables with inconsistent numbers of samples: [10, 730149] +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 33, in + X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper + return func(*args, **kwargs) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/model_selection/_split.py", line 2782, in train_test_split + arrays = indexable(*arrays) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 514, in indexable + check_consistent_length(*result) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/site-packages/sklearn/utils/validation.py", line 457, in check_consistent_length + raise ValueError( +ValueError: Found input variables with inconsistent numbers of samples: [10, 730149] diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt b/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ 
b/wandb/run-20250504_160955-rqk2hbkf/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 
+python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 
+Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 
+datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 
+accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 
+pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..96c1d759e7d3dd3d826e5a66a823a8a3f9265c9c --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:09:55.928947Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "272740364288" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + 
"memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746623370", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027946", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746364170", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027946", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3179106", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1d52051e315a7a21a9d9e5a40a517408bb086162 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2}} \ No newline at end of file diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..999d56f784a1c5621e4f166d8ed3d656b4110162 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:09:55.241065297+03:00","level":"INFO","msg":"started logging, with 
flags","port-filename":"/tmp/tmplpbc9pnb/port-3179132.txt","pid":3179132,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:09:55.241124751+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:09:55.241864+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37981,"Zone":""}} +{"time":"2025-05-04T16:09:55.241967868+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3179132} +{"time":"2025-05-04T16:09:55.428960455+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:55.928508592+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"rqk2hbkf","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:56.056026556+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"rqk2hbkf","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:58.597503038+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:58.597631333+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:09:58.597601675+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:58.597793186+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:59.528863432+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:59.528880642+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:40950"} +{"time":"2025-05-04T16:09:59.528893164+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log new file mode 100644 index 
0000000000000000000000000000000000000000..a63c5f27c7d8b0b1be30a0aa81b63cec47472ec9 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T16:09:55.930352223+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:09:55.930398642+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug-core.log"} +{"time":"2025-05-04T16:09:56.055953645+03:00","level":"INFO","msg":"created new stream","id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:56.056013829+03:00","level":"INFO","msg":"stream: started","id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:56.056183059+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:56.056291373+03:00","level":"INFO","msg":"sender: started","stream_id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:56.056498843+03:00","level":"INFO","msg":"handler: started","stream_id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:56.455842701+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:09:58.597599181+03:00","level":"INFO","msg":"stream: closing","id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:58.597716873+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:09:58.598825235+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:09:58.792882763+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:09:58.792915401+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T16:09:58.792926694+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:09:59.286977407+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:09:59.528666057+03:00","level":"INFO","msg":"handler: closed","stream_id":"rqk2hbkf"} 
+{"time":"2025-05-04T16:09:59.528710573+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:59.528726369+03:00","level":"INFO","msg":"sender: closed","stream_id":"rqk2hbkf"} +{"time":"2025-05-04T16:09:59.528792264+03:00","level":"INFO","msg":"stream: closed","id":"rqk2hbkf"} diff --git a/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log b/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..bb9afff70b842f64258d1cce03b036f94b3b7f15 --- /dev/null +++ b/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log @@ -0,0 +1,26 @@ +2025-05-04 16:09:55,914 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Configure stats pid to 3179132 +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:09:55,915 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:09:55,916 INFO MainThread:3179132 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:09:55,916 INFO MainThread:3179132 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug.log +2025-05-04 
16:09:55,916 INFO MainThread:3179132 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_160955-rqk2hbkf/logs/debug-internal.log +2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():669] starting backend +2025-05-04 16:09:55,917 INFO MainThread:3179132 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:09:55,925 INFO MainThread:3179132 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:09:55,927 INFO MainThread:3179132 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:09:55,965 INFO MainThread:3179132 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:09:55,969 INFO MainThread:3179132 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:09:56,441 INFO MainThread:3179132 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:09:57,857 INFO MainThread:3179132 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:09:57,858 INFO MainThread:3179132 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:09:57,859 INFO MainThread:3179132 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:09:57,859 INFO MainThread:3179132 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 16:09:57,874 INFO MainThread:3179132 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:09:58,598 WARNING MsgRouterThr:3179132 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb b/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e9a6c7bc04fa77bdb7e2940e46071101d371b1d3 Binary files /dev/null and b/wandb/run-20250504_160955-rqk2hbkf/run-rqk2hbkf.wandb differ diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml b/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..901a1d15058a51157e2bae9ec48a096a58e60825 --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/files/config.yaml @@ -0,0 +1,357 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 
+adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + 
value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_16-12-52_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: 
null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/output.log b/wandb/run-20250504_161246-rdbtc2pz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..19a60f451615e772fad3d7c838cfbc32af90c5ca --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/files/output.log @@ -0,0 +1,27 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 70/70 [00:00<00:00, 4499.50 examples/s] +Map: 100%|██████████| 15/15 [00:00<00:00, 2515.68 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:12:57,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+100%|██████████| 6/6 [01:04<00:00, 10.71s/it] +Map: 100%|██████████| 15/15 [00:00<00:00, 3408.53 examples/s] +{'eval_loss': 0.2836913764476776, 'eval_accuracy': 1.0, 'eval_runtime': 0.0837, 'eval_samples_per_second': 179.205, 'eval_steps_per_second': 23.894, 'epoch': 0.89} +{'eval_loss': 0.10505779087543488, 'eval_accuracy': 1.0, 'eval_runtime': 0.0869, 'eval_samples_per_second': 172.624, 'eval_steps_per_second': 23.017, 'epoch': 1.78} +{'eval_loss': 0.05776570364832878, 'eval_accuracy': 1.0, 'eval_runtime': 0.1, 'eval_samples_per_second': 149.979, 'eval_steps_per_second': 19.997, 'epoch': 2.67} +{'train_runtime': 64.2466, 'train_samples_per_second': 3.269, 'train_steps_per_second': 0.093, 'train_loss': 0.3210471471150716, 'epoch': 2.67} +100%|██████████| 2/2 [00:00<00:00, 77.74it/s] +{'eval_loss': 0.05800781771540642, 'eval_accuracy': 1.0, 'eval_runtime': 0.0642, 'eval_samples_per_second': 233.689, 'eval_steps_per_second': 31.158, 'epoch': 2.6666666666666665} +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + model.push_to_hub("isikz/prot_t5_binary_classifier") + File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ + raise AttributeError( +AttributeError: 'T5BinaryClassifier' object has no attribute 'push_to_hub' +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + model.push_to_hub("isikz/prot_t5_binary_classifier") + File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ + raise AttributeError( +AttributeError: 'T5BinaryClassifier' object has no attribute 'push_to_hub' diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt b/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- 
/dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 
+flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 
+aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 
+async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 
+dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 
+pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d7d40733d28be0ffdf7ad38c1cd91cd7308f5fd5 --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:12:46.058889Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274907410432" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + 
"memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746623540", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027947", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746364340", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027947", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3179500", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ca1304c1a6f3387bb206e3ac8c9bfa50dd878c77 --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/files/wandb-summary.json @@ -0,0 +1 @@ +{"train_loss":0.3210471471150716,"_runtime":80.142129451,"train_runtime":64.2466,"eval/loss":0.05800781771540642,"eval/steps_per_second":31.158,"total_flos":0,"eval/samples_per_second":233.689,"train/global_step":6,"_timestamp":1.746364446200474e+09,"train_samples_per_second":3.269,"_wandb":{"runtime":80},"eval/runtime":0.0642,"train_steps_per_second":0.093,"train/epoch":2.6666666666666665,"eval/accuracy":1,"_step":4} \ No newline at end of file diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log 
b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..2233811df4108abf9e6d1a7a308e7fd9f315ac85 --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:12:45.059197409+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmphflqkva1/port-3179526.txt","pid":3179526,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:12:45.059250836+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:12:45.060076988+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3179526} +{"time":"2025-05-04T16:12:45.059982306+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45921,"Zone":""}} +{"time":"2025-05-04T16:12:45.246915089+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:12:46.063164622+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"rdbtc2pz","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:12:46.187062148+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"rdbtc2pz","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:14:06.269673416+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:14:06.269788395+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:14:06.26984398+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:14:06.269980058+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:14:07.608460726+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:33132"} 
+{"time":"2025-05-04T16:14:07.608482723+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:33132"} +{"time":"2025-05-04T16:14:07.60849804+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2f9c168a41928938cfb3d4e4e4131691e000328e --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T16:12:46.065859772+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:12:46.065909143+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug-core.log"} +{"time":"2025-05-04T16:12:46.186999454+03:00","level":"INFO","msg":"created new stream","id":"rdbtc2pz"} +{"time":"2025-05-04T16:12:46.187050012+03:00","level":"INFO","msg":"stream: started","id":"rdbtc2pz"} +{"time":"2025-05-04T16:12:46.187228889+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:12:46.187328701+03:00","level":"INFO","msg":"handler: started","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:12:46.187417103+03:00","level":"INFO","msg":"sender: started","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:12:46.598141294+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:14:06.269782406+03:00","level":"INFO","msg":"stream: closing","id":"rdbtc2pz"} +{"time":"2025-05-04T16:14:06.269825637+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:14:06.270879471+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:14:06.55541099+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:14:06.555433954+03:00","level":"WARN","msg":"No source 
type found, not creating job artifact"} +{"time":"2025-05-04T16:14:06.555445965+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:14:07.09767572+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:14:07.607443104+03:00","level":"INFO","msg":"handler: closed","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:14:07.607487355+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:14:07.607532609+03:00","level":"INFO","msg":"sender: closed","stream_id":"rdbtc2pz"} +{"time":"2025-05-04T16:14:07.607587557+03:00","level":"INFO","msg":"stream: closed","id":"rdbtc2pz"} diff --git a/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log b/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..349cfbd59b697951167c42dd519765d328645a03 --- /dev/null +++ b/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 16:12:46,051 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Configure stats pid to 3179526 +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:12:46,052 
INFO MainThread:3179526 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:12:46,052 INFO MainThread:3179526 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug.log +2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_161246-rdbtc2pz/logs/debug-internal.log +2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():669] starting backend +2025-05-04 16:12:46,053 INFO MainThread:3179526 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:12:46,057 INFO MainThread:3179526 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:12:46,058 INFO MainThread:3179526 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:12:46,064 INFO MainThread:3179526 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:12:46,067 INFO MainThread:3179526 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:12:46,584 INFO MainThread:3179526 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:12:47,966 INFO MainThread:3179526 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 16:12:47,974 INFO MainThread:3179526 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:13:01,857 INFO MainThread:3179526 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-12-52_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 
'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 16:14:06,270 WARNING MsgRouterThr:3179526 [router.py:message_loop():75] message_loop 
has been closed diff --git a/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb b/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f57225fe3777b25bea8c60ee43eed186f1df565a Binary files /dev/null and b/wandb/run-20250504_161246-rdbtc2pz/run-rdbtc2pz.wandb differ diff --git a/wandb/run-20250504_162343-cp870jym/files/config.yaml b/wandb/run-20250504_162343-cp870jym/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18d70a8edeb099baad34583edef28bf48cb9585b --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/files/config.yaml @@ -0,0 +1,357 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null 
+dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive 
+log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_16-23-49_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + 
value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_162343-cp870jym/files/output.log b/wandb/run-20250504_162343-cp870jym/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..37d4dd8bc142bc7bd0e5821b6fd7fc2418f4768a --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/files/output.log @@ -0,0 +1,27 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 70/70 [00:00<00:00, 4479.59 examples/s] +Map: 100%|██████████| 15/15 [00:00<00:00, 2556.26 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:23:55,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+100%|██████████| 6/6 [01:08<00:00, 11.47s/it] +Map: 100%|██████████| 15/15 [00:00<00:00, 3414.44 examples/s] +{'eval_loss': 0.32496747374534607, 'eval_accuracy': 1.0, 'eval_runtime': 0.0946, 'eval_samples_per_second': 158.536, 'eval_steps_per_second': 21.138, 'epoch': 0.89} +{'eval_loss': 0.14126792550086975, 'eval_accuracy': 1.0, 'eval_runtime': 0.0935, 'eval_samples_per_second': 160.347, 'eval_steps_per_second': 21.38, 'epoch': 1.78} +{'eval_loss': 0.08305665105581284, 'eval_accuracy': 1.0, 'eval_runtime': 0.0868, 'eval_samples_per_second': 172.874, 'eval_steps_per_second': 23.05, 'epoch': 2.67} +{'train_runtime': 68.815, 'train_samples_per_second': 3.052, 'train_steps_per_second': 0.087, 'train_loss': 0.34361688296000165, 'epoch': 2.67} +100%|██████████| 2/2 [00:00<00:00, 93.00it/s] +{'eval_loss': 0.07820229977369308, 'eval_accuracy': 1.0, 'eval_runtime': 0.0516, 'eval_samples_per_second': 290.667, 'eval_steps_per_second': 38.756, 'epoch': 2.6666666666666665} +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + model.save_pretrained( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ + raise AttributeError( +AttributeError: 'T5BinaryClassifier' object has no attribute 'save_pretrained' +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + model.save_pretrained( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1928, in __getattr__ + raise AttributeError( +AttributeError: 'T5BinaryClassifier' object has no attribute 'save_pretrained' diff --git a/wandb/run-20250504_162343-cp870jym/files/requirements.txt b/wandb/run-20250504_162343-cp870jym/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ 
b/wandb/run-20250504_162343-cp870jym/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 
+python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 
+Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 
+datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 
+accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 
+pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json b/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e5bd8ffd94d157531e5b2d7abc7c46e50d9074ff --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:23:43.746737Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274884100096" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + 
"memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746624198", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027950", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746364998", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027950", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3180708", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json b/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e9e34e6f68682beece2671803867da11fe15a3c5 --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/files/wandb-summary.json @@ -0,0 +1 @@ +{"_step":4,"_runtime":84.708140457,"train_runtime":68.815,"eval/runtime":0.0516,"_wandb":{"runtime":84},"train_samples_per_second":3.052,"train/epoch":2.6666666666666665,"eval/loss":0.07820229977369308,"train_loss":0.34361688296000165,"total_flos":0,"_timestamp":1.7463651084544086e+09,"eval/samples_per_second":290.667,"eval/accuracy":1,"train_steps_per_second":0.087,"train/global_step":6,"eval/steps_per_second":38.756} \ No newline at end of file diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug-core.log 
b/wandb/run-20250504_162343-cp870jym/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..1e2f80d79d66ceb2fd9940e2b195bd656cbe50a6 --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:23:43.103970405+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpgvzpqnd2/port-3180737.txt","pid":3180737,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:23:43.104018+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:23:43.104795371+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39787,"Zone":""}} +{"time":"2025-05-04T16:23:43.104898929+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3180737} +{"time":"2025-05-04T16:23:43.291758092+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:23:43.748521574+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"cp870jym","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:23:43.873512977+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"cp870jym","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:25:08.531174232+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:25:08.531307956+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:25:08.531367815+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:25:08.53150429+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:25:09.788149247+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:38582"} 
+{"time":"2025-05-04T16:25:09.788183611+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:38582"} +{"time":"2025-05-04T16:25:09.788206528+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log b/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1625047ba9735e4a64500c381492219a264c5df4 --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T16:23:43.750249064+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:23:43.750294337+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug-core.log"} +{"time":"2025-05-04T16:23:43.873441585+03:00","level":"INFO","msg":"created new stream","id":"cp870jym"} +{"time":"2025-05-04T16:23:43.873500609+03:00","level":"INFO","msg":"stream: started","id":"cp870jym"} +{"time":"2025-05-04T16:23:43.873652279+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"cp870jym"} +{"time":"2025-05-04T16:23:43.873745942+03:00","level":"INFO","msg":"handler: started","stream_id":"cp870jym"} +{"time":"2025-05-04T16:23:43.873943316+03:00","level":"INFO","msg":"sender: started","stream_id":"cp870jym"} +{"time":"2025-05-04T16:23:44.451037367+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:25:08.531294356+03:00","level":"INFO","msg":"stream: closing","id":"cp870jym"} +{"time":"2025-05-04T16:25:08.531341197+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:25:08.532383047+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:25:08.797985156+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:25:08.798011707+03:00","level":"WARN","msg":"No source 
type found, not creating job artifact"} +{"time":"2025-05-04T16:25:08.798022316+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:25:09.301751579+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:25:09.787364957+03:00","level":"INFO","msg":"handler: closed","stream_id":"cp870jym"} +{"time":"2025-05-04T16:25:09.787438823+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"cp870jym"} +{"time":"2025-05-04T16:25:09.78745243+03:00","level":"INFO","msg":"sender: closed","stream_id":"cp870jym"} +{"time":"2025-05-04T16:25:09.787535096+03:00","level":"INFO","msg":"stream: closed","id":"cp870jym"} diff --git a/wandb/run-20250504_162343-cp870jym/logs/debug.log b/wandb/run-20250504_162343-cp870jym/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a5def13ec74b1339547213aa42656f477507efbb --- /dev/null +++ b/wandb/run-20250504_162343-cp870jym/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 16:23:43,738 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:23:43,738 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Configure stats pid to 3180737 +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:23:43,739 
INFO MainThread:3180737 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug.log +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162343-cp870jym/logs/debug-internal.log +2025-05-04 16:23:43,739 INFO MainThread:3180737 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():669] starting backend +2025-05-04 16:23:43,740 INFO MainThread:3180737 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:23:43,745 INFO MainThread:3180737 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:23:43,746 INFO MainThread:3180737 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:23:43,754 INFO MainThread:3180737 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:23:43,757 INFO MainThread:3180737 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:23:44,437 INFO MainThread:3180737 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:23:45,830 INFO MainThread:3180737 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:23:45,831 INFO MainThread:3180737 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:23:45,831 INFO MainThread:3180737 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:23:45,832 INFO MainThread:3180737 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 16:23:45,842 INFO MainThread:3180737 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:23:59,567 INFO MainThread:3180737 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-23-49_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 
'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 16:25:08,531 WARNING MsgRouterThr:3180737 [router.py:message_loop():75] message_loop 
has been closed diff --git a/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb b/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb new file mode 100644 index 0000000000000000000000000000000000000000..948b5b7c23ce67858ab71bf4a99f3cfd510cbee6 Binary files /dev/null and b/wandb/run-20250504_162343-cp870jym/run-cp870jym.wandb differ diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml b/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e371277d6ebafd880eb06c8f0ef37b936a3a706 --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/files/config.yaml @@ -0,0 +1,357 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null 
+dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive 
+log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_16-28-19_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + 
value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/output.log b/wandb/run-20250504_162813-vqs6o6w5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a8998b57d109ed7312a9c2e3f41928ec6b4905de --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/files/output.log @@ -0,0 +1,23 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 70/70 [00:00<00:00, 4467.73 examples/s] +Map: 100%|██████████| 15/15 [00:00<00:00, 2557.19 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:28:25,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+100%|██████████| 6/6 [01:06<00:00, 11.03s/it] +Map: 100%|██████████| 15/15 [00:00<00:00, 3353.30 examples/s] +{'eval_loss': 0.23444823920726776, 'eval_accuracy': 1.0, 'eval_runtime': 0.0842, 'eval_samples_per_second': 178.164, 'eval_steps_per_second': 23.755, 'epoch': 0.89} +{'eval_loss': 0.08114013075828552, 'eval_accuracy': 1.0, 'eval_runtime': 0.0928, 'eval_samples_per_second': 161.657, 'eval_steps_per_second': 21.554, 'epoch': 1.78} +{'eval_loss': 0.0510762594640255, 'eval_accuracy': 1.0, 'eval_runtime': 0.0788, 'eval_samples_per_second': 190.397, 'eval_steps_per_second': 25.386, 'epoch': 2.67} +{'train_runtime': 66.2064, 'train_samples_per_second': 3.172, 'train_steps_per_second': 0.091, 'train_loss': 0.281462828318278, 'epoch': 2.67} +100%|██████████| 2/2 [00:00<00:00, 90.79it/s] +{'eval_loss': 0.046335864812135696, 'eval_accuracy': 1.0, 'eval_runtime': 0.0528, 'eval_samples_per_second': 284.031, 'eval_steps_per_second': 37.871, 'epoch': 2.6666666666666665} +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + trainer.save_model( +TypeError: Trainer.save_model() got an unexpected keyword argument 'safe_serialization' +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + trainer.save_model( +TypeError: Trainer.save_model() got an unexpected keyword argument 'safe_serialization' diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt b/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 
+nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 
+cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 
+distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 
+rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 
+graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 
+starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4cdef3cdbd7a79ab90f8929362f8e799e456f186 --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:28:13.563930Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274899660800" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", 
+ "job_end_time": "1746624467", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027952", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746365267", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027952", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3182008", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..35b5e4df9660cdaaebd8d4bd033fa5210a1d5fcd --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/files/wandb-summary.json @@ -0,0 +1 @@ +{"train_steps_per_second":0.091,"eval/samples_per_second":284.031,"_step":4,"eval/runtime":0.0528,"eval/loss":0.046335864812135696,"total_flos":0,"train_runtime":66.2064,"train_loss":0.281462828318278,"eval/accuracy":1,"_timestamp":1.746365376058223e+09,"_runtime":82.494806798,"train/epoch":2.6666666666666665,"train_samples_per_second":3.172,"_wandb":{"runtime":82},"eval/steps_per_second":37.871,"train/global_step":6} \ No newline at end of file diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..ed0d11f5576631e8906b885ebeffe69cd840f942 --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:28:12.92389089+03:00","level":"INFO","msg":"started 
logging, with flags","port-filename":"/tmp/tmpsu_1e075/port-3182035.txt","pid":3182035,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:28:12.923946336+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:28:12.924930159+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3182035} +{"time":"2025-05-04T16:28:12.924790098+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":40313,"Zone":""}} +{"time":"2025-05-04T16:28:13.10973957+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:28:13.567567491+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"vqs6o6w5","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:28:13.69241432+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vqs6o6w5","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:36.127793865+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:36.127929839+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:29:36.127907923+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:36.128125509+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:37.558291716+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:37.558314076+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:59452"} +{"time":"2025-05-04T16:29:37.558329488+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log new file mode 100644 
index 0000000000000000000000000000000000000000..92f3f80ba91679919e80e56dd8abd8e950015b71 --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T16:28:13.569618821+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:28:13.56966556+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug-core.log"} +{"time":"2025-05-04T16:28:13.692347406+03:00","level":"INFO","msg":"created new stream","id":"vqs6o6w5"} +{"time":"2025-05-04T16:28:13.692401835+03:00","level":"INFO","msg":"stream: started","id":"vqs6o6w5"} +{"time":"2025-05-04T16:28:13.692589976+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"vqs6o6w5"} +{"time":"2025-05-04T16:28:13.69268191+03:00","level":"INFO","msg":"handler: started","stream_id":"vqs6o6w5"} +{"time":"2025-05-04T16:28:13.692686366+03:00","level":"INFO","msg":"sender: started","stream_id":"vqs6o6w5"} +{"time":"2025-05-04T16:28:14.077830252+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:29:36.127909239+03:00","level":"INFO","msg":"stream: closing","id":"vqs6o6w5"} +{"time":"2025-05-04T16:29:36.127953372+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:29:36.129135049+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:29:36.381385986+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:29:36.381410641+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T16:29:36.381421107+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:29:36.890657991+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:29:37.557157658+03:00","level":"INFO","msg":"handler: closed","stream_id":"vqs6o6w5"} 
+{"time":"2025-05-04T16:29:37.55721188+03:00","level":"INFO","msg":"sender: closed","stream_id":"vqs6o6w5"} +{"time":"2025-05-04T16:29:37.557201882+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vqs6o6w5"} +{"time":"2025-05-04T16:29:37.557304847+03:00","level":"INFO","msg":"stream: closed","id":"vqs6o6w5"} diff --git a/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log b/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0fb69f8e411995c37137af59a45087557d8c1802 --- /dev/null +++ b/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 16:28:13,556 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Configure stats pid to 3182035 +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:28:13,557 INFO MainThread:3182035 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug.log +2025-05-04 
16:28:13,558 INFO MainThread:3182035 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_162813-vqs6o6w5/logs/debug-internal.log +2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():669] starting backend +2025-05-04 16:28:13,558 INFO MainThread:3182035 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:28:13,562 INFO MainThread:3182035 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:28:13,563 INFO MainThread:3182035 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:28:13,569 INFO MainThread:3182035 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:28:13,572 INFO MainThread:3182035 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:28:14,063 INFO MainThread:3182035 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:28:15,452 INFO MainThread:3182035 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 16:28:15,461 INFO MainThread:3182035 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:28:29,768 INFO MainThread:3182035 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-28-19_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 
'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 16:29:36,128 WARNING MsgRouterThr:3182035 [router.py:message_loop():75] message_loop 
has been closed diff --git a/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb b/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a738d1c054d6cee9c4fee21d6554ff8c952d6250 Binary files /dev/null and b/wandb/run-20250504_162813-vqs6o6w5/run-vqs6o6w5.wandb differ diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml b/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b134dd5782bfb1efdf79a02c28eb4463bbe598a7 --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/files/config.yaml @@ -0,0 +1,357 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null 
+dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive 
+log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_16-32-08_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + 
value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/output.log b/wandb/run-20250504_163202-a8cxeqmf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3df36c5de543befcbdc45eb6aa9b4dd90fcb1682 --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/files/output.log @@ -0,0 +1,35 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 70/70 [00:00<00:00, 4245.80 examples/s] +Map: 100%|██████████| 15/15 [00:00<00:00, 2515.98 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:32:13,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+100%|██████████| 6/6 [01:07<00:00, 11.21s/it] +Map: 100%|██████████| 15/15 [00:00<00:00, 3498.75 examples/s] +{'eval_loss': 0.28029781579971313, 'eval_accuracy': 1.0, 'eval_runtime': 0.0833, 'eval_samples_per_second': 180.04, 'eval_steps_per_second': 24.005, 'epoch': 0.89} +{'eval_loss': 0.1000773161649704, 'eval_accuracy': 1.0, 'eval_runtime': 0.0863, 'eval_samples_per_second': 173.864, 'eval_steps_per_second': 23.182, 'epoch': 1.78} +{'eval_loss': 0.05684203654527664, 'eval_accuracy': 1.0, 'eval_runtime': 0.0937, 'eval_samples_per_second': 160.033, 'eval_steps_per_second': 21.338, 'epoch': 2.67} +{'train_runtime': 67.2983, 'train_samples_per_second': 3.12, 'train_steps_per_second': 0.089, 'train_loss': 0.31141672531763714, 'epoch': 2.67} +100%|██████████| 2/2 [00:00<00:00, 101.26it/s] +{'eval_loss': 0.04954631254076958, 'eval_accuracy': 1.0, 'eval_runtime': 0.0471, 'eval_samples_per_second': 318.692, 'eval_steps_per_second': 42.492, 'epoch': 2.6666666666666665} +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + trainer.save_model( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model + self._save(output_dir) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3704, in _save + os.makedirs(output_dir, exist_ok=True) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/os.py", line 225, in makedirs + mkdir(name, mode) +PermissionError: [Errno 13] Permission denied: '/prott5_bc_ft' +Traceback (most recent call last): + File "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", line 141, in + trainer.save_model( + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3623, in save_model + self._save(output_dir) + File "/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3704, in _save + os.makedirs(output_dir, 
exist_ok=True) + File "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/lib/python3.10/os.py", line 225, in makedirs + mkdir(name, mode) +PermissionError: [Errno 13] Permission denied: '/prott5_bc_ft' diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt b/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 
+llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 
+distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 
+opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 
+cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 
+dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..43011783c61f4ffb7c1e598cc8f3542d15c2ce5a --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:32:02.055600Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": 
"/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274920230912" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746624696", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027955", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746365496", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027955", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3182736", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ef3a5d0d977ffd8ab00ee2541f1ee061080e14a6 --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_wandb":{"runtime":83},"train_loss":0.31141672531763714,"train_steps_per_second":0.089,"eval/accuracy":1,"eval/steps_per_second":42.492,"train_samples_per_second":3.12,"eval/samples_per_second":318.692,"train/global_step":6,"train_runtime":67.2983,"_runtime":83.580626717,"_timestamp":1.746365605635744e+09,"eval/loss":0.04954631254076958,"train/epoch":2.6666666666666665,"total_flos":0,"eval/runtime":0.0471,"_step":4} \ No newline at end of file diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..4d5cf6f58a5c9fc81b4569f485360bbc66a4434a --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:32:01.418393778+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmplvhvsc_q/port-3182760.txt","pid":3182760,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:32:01.418441665+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T16:32:01.419205535+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":45799,"Zone":""}} +{"time":"2025-05-04T16:32:01.419534072+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3182760} +{"time":"2025-05-04T16:32:01.606152917+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:32:02.057688618+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"a8cxeqmf","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:32:02.186607102+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"a8cxeqmf","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:33:25.702060103+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:48576"} 
+{"time":"2025-05-04T16:33:25.702177617+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:33:25.702273436+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:33:25.702373794+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:33:26.974600306+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:33:26.974618713+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:48576"} +{"time":"2025-05-04T16:33:26.974630492+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ebf1e141889b4f95835db70d0596ae304d399be9 --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2025-05-04T16:32:02.059376166+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:32:02.059422726+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug-core.log"} +{"time":"2025-05-04T16:32:02.18652874+03:00","level":"INFO","msg":"created new stream","id":"a8cxeqmf"} +{"time":"2025-05-04T16:32:02.186595094+03:00","level":"INFO","msg":"stream: started","id":"a8cxeqmf"} +{"time":"2025-05-04T16:32:02.18671057+03:00","level":"INFO","msg":"handler: started","stream_id":"a8cxeqmf"} +{"time":"2025-05-04T16:32:02.186759328+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"a8cxeqmf"} +{"time":"2025-05-04T16:32:02.186873015+03:00","level":"INFO","msg":"sender: started","stream_id":"a8cxeqmf"} 
+{"time":"2025-05-04T16:32:02.609103171+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:33:25.702185181+03:00","level":"INFO","msg":"stream: closing","id":"a8cxeqmf"} +{"time":"2025-05-04T16:33:25.702250772+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:33:25.703241445+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:33:25.984446677+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:33:25.98447338+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T16:33:25.984484498+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:33:26.497866306+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:33:26.973748978+03:00","level":"INFO","msg":"handler: closed","stream_id":"a8cxeqmf"} +{"time":"2025-05-04T16:33:26.973797312+03:00","level":"INFO","msg":"sender: closed","stream_id":"a8cxeqmf"} +{"time":"2025-05-04T16:33:26.973781655+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"a8cxeqmf"} +{"time":"2025-05-04T16:33:26.973934766+03:00","level":"INFO","msg":"stream: closed","id":"a8cxeqmf"} diff --git a/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log b/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2d0d3a8a5a0e5c625f7eb9466c7bbde46220ad7b --- /dev/null +++ b/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Configure stats pid to 3182760 +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:32:02,049 INFO 
MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:32:02,049 INFO MainThread:3182760 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug.log +2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163202-a8cxeqmf/logs/debug-internal.log +2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():669] starting backend +2025-05-04 16:32:02,050 INFO MainThread:3182760 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:32:02,054 INFO MainThread:3182760 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:32:02,055 INFO MainThread:3182760 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:32:02,063 INFO MainThread:3182760 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:32:02,066 INFO MainThread:3182760 
[wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:32:02,595 INFO MainThread:3182760 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:32:03,942 INFO MainThread:3182760 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:32:03,943 INFO MainThread:3182760 [wandb_run.py:_redirect():2395] Redirects installed. +2025-05-04 16:32:03,951 INFO MainThread:3182760 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:32:18,271 INFO MainThread:3182760 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-32-08_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': 
False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 
'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 16:33:25,702 WARNING MsgRouterThr:3182760 [router.py:message_loop():75] message_loop has been closed diff --git a/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb b/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6a806abc084b97c4e0ea4f3b996341285fb7d3e9 Binary files /dev/null and b/wandb/run-20250504_163202-a8cxeqmf/run-a8cxeqmf.wandb differ diff --git a/wandb/run-20250504_163644-j17n0z1w/files/config.yaml b/wandb/run-20250504_163644-j17n0z1w/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47672e8d6eabcdaeb89962fefba89c107dc6703e --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/files/config.yaml @@ -0,0 +1,357 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/accuracy + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 1 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 
+ "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto 
+hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_16-36-51_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch 
+save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_163644-j17n0z1w/files/output.log b/wandb/run-20250504_163644-j17n0z1w/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..789e4c60cde818136ebb5c5f55d9196d2dbffb5e --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/files/output.log @@ -0,0 +1,15 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 70/70 [00:00<00:00, 6893.99 examples/s] +Map: 100%|██████████| 15/15 [00:00<00:00, 3422.06 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 16:36:56,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. 
If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. +100%|██████████| 6/6 [01:09<00:00, 11.54s/it] +Map: 100%|██████████| 15/15 [00:00<00:00, 3398.03 examples/s] +{'eval_loss': 0.30133458971977234, 'eval_accuracy': 1.0, 'eval_runtime': 0.084, 'eval_samples_per_second': 178.602, 'eval_steps_per_second': 23.814, 'epoch': 0.89} +{'eval_loss': 0.14025470614433289, 'eval_accuracy': 1.0, 'eval_runtime': 0.0899, 'eval_samples_per_second': 166.815, 'eval_steps_per_second': 22.242, 'epoch': 1.78} +{'eval_loss': 0.09236248582601547, 'eval_accuracy': 1.0, 'eval_runtime': 0.0606, 'eval_samples_per_second': 247.332, 'eval_steps_per_second': 32.978, 'epoch': 2.67} +{'train_runtime': 69.2309, 'train_samples_per_second': 3.033, 'train_steps_per_second': 0.087, 'train_loss': 0.34036485354105633, 'epoch': 2.67} +100%|██████████| 2/2 [00:00<00:00, 93.34it/s] +{'eval_loss': 0.09890136122703552, 'eval_accuracy': 1.0, 'eval_runtime': 0.0503, 'eval_samples_per_second': 298.458, 'eval_steps_per_second': 39.794, 'epoch': 2.6666666666666665} diff --git a/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt b/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 
+nvidia-cudnn-cu12==9.1.0.70 +torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 
+tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 +cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 
+transformers==4.33.0 +prometheus_flask_exporter==0.23.1 +dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 
+fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 +google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 
+pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 +python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 
+importlib_resources==6.4.0 +tomli==2.0.1 +jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json b/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..661f0244432fdb8428ff70df9987a780c88edab2 --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T13:36:44.683493Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274930868224" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746624978", + "job_gid": "11636", + "job_gpus": "1", + "job_id": "1027956", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746365778", + "job_uid": "11636", + "job_user": "zisik", + 
"jobid": "1027956", + "localid": "0", + "mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3183359", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json b/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ff8192ac678b3079d8ae2b540b83b8069023ccca --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/files/wandb-summary.json @@ -0,0 +1 @@ +{"eval/loss":0.09890136122703552,"_timestamp":1.746365889900344e+09,"train_steps_per_second":0.087,"train/epoch":2.6666666666666665,"eval/accuracy":1,"_wandb":{"runtime":90},"total_flos":0,"train_samples_per_second":3.033,"eval/samples_per_second":298.458,"eval/steps_per_second":39.794,"train_runtime":69.2309,"eval/runtime":0.0503,"train_loss":0.34036485354105633,"_step":4,"train/global_step":6,"_runtime":85.217340117} \ No newline at end of file diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..740d60d9f71c8c587518099b5e357d43e8786f46 --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T16:36:43.800622213+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpudnn84p2/port-3183386.txt","pid":3183386,"debug":false,"disable-analytics":false} +{"time":"2025-05-04T16:36:43.800675477+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} 
+{"time":"2025-05-04T16:36:43.801533455+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3183386} +{"time":"2025-05-04T16:36:43.801429105+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34585,"Zone":""}} +{"time":"2025-05-04T16:36:43.98511968+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:36:44.686997088+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"j17n0z1w","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:36:44.811113202+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"j17n0z1w","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:15.462653307+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:15.462760405+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:15.462866235+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-04T16:38:15.462928073+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:16.450021056+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:16.450050764+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:42446"} +{"time":"2025-05-04T16:38:16.450073997+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..702df31c286faaf6c26783e1a598776dc6727960 --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log @@ -0,0 +1,19 @@ 
+{"time":"2025-05-04T16:36:44.68958036+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T16:36:44.68962696+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug-core.log"} +{"time":"2025-05-04T16:36:44.811045191+03:00","level":"INFO","msg":"created new stream","id":"j17n0z1w"} +{"time":"2025-05-04T16:36:44.81110033+03:00","level":"INFO","msg":"stream: started","id":"j17n0z1w"} +{"time":"2025-05-04T16:36:44.811127326+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:36:44.812597235+03:00","level":"INFO","msg":"handler: started","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:36:44.812682202+03:00","level":"INFO","msg":"sender: started","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:36:45.2302005+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T16:38:15.462763223+03:00","level":"INFO","msg":"stream: closing","id":"j17n0z1w"} +{"time":"2025-05-04T16:38:15.462833186+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-04T16:38:15.463959432+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-04T16:38:15.653986013+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-04T16:38:15.654018889+03:00","level":"WARN","msg":"No source type found, not creating job artifact"} +{"time":"2025-05-04T16:38:15.654030152+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-04T16:38:16.194806616+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-04T16:38:16.449749922+03:00","level":"INFO","msg":"handler: closed","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:38:16.449817209+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:38:16.449847499+03:00","level":"INFO","msg":"sender: 
closed","stream_id":"j17n0z1w"} +{"time":"2025-05-04T16:38:16.449922381+03:00","level":"INFO","msg":"stream: closed","id":"j17n0z1w"} diff --git a/wandb/run-20250504_163644-j17n0z1w/logs/debug.log b/wandb/run-20250504_163644-j17n0z1w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d737c7f055847a85314b84c35816a14c7b1b12cf --- /dev/null +++ b/wandb/run-20250504_163644-j17n0z1w/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Configure stats pid to 3183386 +2025-05-04 16:36:44,676 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug.log +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_163644-j17n0z1w/logs/debug-internal.log 
+2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:init():619] calling init triggers +2025-05-04 16:36:44,677 INFO MainThread:3183386 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 16:36:44,678 INFO MainThread:3183386 [wandb_init.py:init():669] starting backend +2025-05-04 16:36:44,678 INFO MainThread:3183386 [wandb_init.py:init():673] sending inform_init request +2025-05-04 16:36:44,682 INFO MainThread:3183386 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 16:36:44,683 INFO MainThread:3183386 [wandb_init.py:init():686] backend started and connected +2025-05-04 16:36:44,690 INFO MainThread:3183386 [wandb_init.py:init():781] updated telemetry +2025-05-04 16:36:44,693 INFO MainThread:3183386 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 16:36:45,217 INFO MainThread:3183386 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 16:36:46,645 INFO MainThread:3183386 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 16:36:46,651 INFO MainThread:3183386 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 16:37:00,590 INFO MainThread:3183386 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_16-36-51_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 
'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-04 16:38:15,463 WARNING MsgRouterThr:3183386 [router.py:message_loop():75] message_loop 
has been closed diff --git a/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb b/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb new file mode 100644 index 0000000000000000000000000000000000000000..dd24d8341cb77819cc99d2fe0367ee037f5874e8 Binary files /dev/null and b/wandb/run-20250504_163644-j17n0z1w/run-j17n0z1w.wandb differ diff --git a/wandb/run-20250504_172503-0ictlmwf/files/config.yaml b/wandb/run-20250504_172503-0ictlmwf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90819acdbf6ef774fd8d0e6d0cd98145d182ec2b --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/files/config.yaml @@ -0,0 +1,375 @@ +_wandb: + value: + cli_version: 0.18.7 + m: + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/accuracy + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.10.15 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "2": + - 1 + - 2 + - 3 + - 5 + - 6 + - 11 + - 12 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + - 105 + "3": + - 7 + - 23 + - 55 + - 62 + - 66 + "4": 3.10.15 + "5": 0.18.7 + "6": 4.45.2 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.7 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: 
+ value: 1e-08 +auto_find_batch_size: + value: false +batch_eval_metrics: + value: false +bf16: + value: false +bf16_full_eval: + value: false +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: epoch +eval_use_gather_object: + value: false +evaluation_strategy: + value: epoch +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 4 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +half_precision_backend: + value: auto +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 
+learning_rate: + value: 5e-05 +length_column_name: + value: length +load_best_model_at_end: + value: true +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: t5-bc-out/runs/May04_17-25-43_kolyoz1 +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 500 +logging_strategy: + value: steps +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_steps: + value: -1 +metric_for_best_model: + value: loss +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_dir: + value: t5-bc-out +overwrite_output_dir: + value: false +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 8 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_unused_columns: + value: true +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +run_name: + value: t5-bc-out +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: false +save_steps: + value: 500 +save_strategy: + value: epoch +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +split_batches: + value: null +tf32: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +use_cpu: + value: false +use_ipex: + value: 
false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +warmup_ratio: + value: 0 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20250504_172503-0ictlmwf/files/output.log b/wandb/run-20250504_172503-0ictlmwf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f279e4f7e0075186a5bddae1ec00f2da2afeb33d --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/files/output.log @@ -0,0 +1,110 @@ +You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +Map: 100%|██████████| 511104/511104 [00:29<00:00, 17366.65 examples/s] +Map: 100%|██████████| 109522/109522 [00:04<00:00, 26402.34 examples/s] +/arf/home/zisik/.local/lib/python3.10/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2025-05-04 17:25:48,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. 
+ 33%|███▎ | 15972/47916 [2:23:32<4:53:49, 1.81it/s] +{'loss': 0.5856, 'grad_norm': 1.3348039388656616, 'learning_rate': 4.947825361048502e-05, 'epoch': 0.03} +{'loss': 0.5183, 'grad_norm': 2.473144292831421, 'learning_rate': 4.8956507220970036e-05, 'epoch': 0.06} +{'loss': 0.4879, 'grad_norm': 3.6210598945617676, 'learning_rate': 4.843476083145505e-05, 'epoch': 0.09} +{'loss': 0.4579, 'grad_norm': 6.336288928985596, 'learning_rate': 4.791405793471909e-05, 'epoch': 0.13} +{'loss': 0.4421, 'grad_norm': 2.6699299812316895, 'learning_rate': 4.739231154520411e-05, 'epoch': 0.16} +{'loss': 0.4205, 'grad_norm': 7.918868064880371, 'learning_rate': 4.6870565155689124e-05, 'epoch': 0.19} +{'loss': 0.4044, 'grad_norm': 2.9816083908081055, 'learning_rate': 4.634881876617414e-05, 'epoch': 0.22} +{'loss': 0.3901, 'grad_norm': 7.581803321838379, 'learning_rate': 4.582707237665916e-05, 'epoch': 0.25} +{'loss': 0.3834, 'grad_norm': 6.031352996826172, 'learning_rate': 4.5305325987144174e-05, 'epoch': 0.28} +{'loss': 0.3601, 'grad_norm': 2.581623077392578, 'learning_rate': 4.478357959762919e-05, 'epoch': 0.31} +{'loss': 0.3492, 'grad_norm': 4.7024245262146, 'learning_rate': 4.42618332081142e-05, 'epoch': 0.34} +{'loss': 0.3435, 'grad_norm': 8.929915428161621, 'learning_rate': 4.374217380415728e-05, 'epoch': 0.38} +{'loss': 0.3366, 'grad_norm': 3.694370985031128, 'learning_rate': 4.32204274146423e-05, 'epoch': 0.41} +{'loss': 0.3259, 'grad_norm': 5.6961350440979, 'learning_rate': 4.2698681025127307e-05, 'epoch': 0.44} +{'loss': 0.3224, 'grad_norm': 2.740339756011963, 'learning_rate': 4.217693463561232e-05, 'epoch': 0.47} +{'loss': 0.3103, 'grad_norm': 3.7285494804382324, 'learning_rate': 4.165518824609734e-05, 'epoch': 0.5} +{'loss': 0.3107, 'grad_norm': 5.1480326652526855, 'learning_rate': 4.1133441856582356e-05, 'epoch': 0.53} +{'loss': 0.2945, 'grad_norm': 4.8817620277404785, 'learning_rate': 4.0611695467067366e-05, 'epoch': 0.56} +{'loss': 0.2903, 'grad_norm': 5.003459453582764, 
'learning_rate': 4.008994907755238e-05, 'epoch': 0.59} +{'loss': 0.284, 'grad_norm': 6.451533317565918, 'learning_rate': 3.95682026880374e-05, 'epoch': 0.63} +{'loss': 0.276, 'grad_norm': 7.442136287689209, 'learning_rate': 3.9046456298522416e-05, 'epoch': 0.66} +{'loss': 0.27, 'grad_norm': 3.617513656616211, 'learning_rate': 3.852575340178646e-05, 'epoch': 0.69} +{'loss': 0.2666, 'grad_norm': 5.776317596435547, 'learning_rate': 3.800400701227148e-05, 'epoch': 0.72} +{'loss': 0.257, 'grad_norm': 6.264099597930908, 'learning_rate': 3.7482260622756494e-05, 'epoch': 0.75} +{'loss': 0.2566, 'grad_norm': 4.222651481628418, 'learning_rate': 3.6960514233241504e-05, 'epoch': 0.78} +{'loss': 0.2502, 'grad_norm': 6.953704833984375, 'learning_rate': 3.643876784372652e-05, 'epoch': 0.81} +{'loss': 0.2364, 'grad_norm': 3.2264351844787598, 'learning_rate': 3.591806494699057e-05, 'epoch': 0.85} +{'loss': 0.2451, 'grad_norm': 6.233669281005859, 'learning_rate': 3.539631855747558e-05, 'epoch': 0.88} +{'loss': 0.2364, 'grad_norm': 8.540342330932617, 'learning_rate': 3.48745721679606e-05, 'epoch': 0.91} +{'loss': 0.2312, 'grad_norm': 4.3881516456604, 'learning_rate': 3.4352825778445616e-05, 'epoch': 0.94} +{'loss': 0.2323, 'grad_norm': 6.7153167724609375, 'learning_rate': 3.383107938893063e-05, 'epoch': 0.97} + +{'eval_loss': 0.2026778757572174, 'eval_accuracy': 0.9204725991125071, 'eval_runtime': 180.0542, 'eval_samples_per_second': 608.272, 'eval_steps_per_second': 76.038, 'epoch': 1.0} +{'loss': 0.2163, 'grad_norm': 4.329936504364014, 'learning_rate': 3.331037649219468e-05, 'epoch': 1.0} +{'loss': 0.139, 'grad_norm': 8.806492805480957, 'learning_rate': 3.278863010267969e-05, 'epoch': 1.03} +{'loss': 0.1419, 'grad_norm': 9.733407020568848, 'learning_rate': 3.226688371316471e-05, 'epoch': 1.06} +{'loss': 0.1361, 'grad_norm': 3.5503616333007812, 'learning_rate': 3.174513732364972e-05, 'epoch': 1.1} +{'loss': 0.1398, 'grad_norm': 5.853847503662109, 'learning_rate': 
3.122339093413474e-05, 'epoch': 1.13} +{'loss': 0.1373, 'grad_norm': 1.6936904191970825, 'learning_rate': 3.0701644544619754e-05, 'epoch': 1.16} +{'loss': 0.1423, 'grad_norm': 1.5299335718154907, 'learning_rate': 3.017989815510477e-05, 'epoch': 1.19} +{'loss': 0.1391, 'grad_norm': 3.899322986602783, 'learning_rate': 2.965815176558978e-05, 'epoch': 1.22} +{'loss': 0.1408, 'grad_norm': 2.3118438720703125, 'learning_rate': 2.913744886885383e-05, 'epoch': 1.25} +{'loss': 0.1408, 'grad_norm': 0.6930440068244934, 'learning_rate': 2.8615702479338845e-05, 'epoch': 1.28} +{'loss': 0.1404, 'grad_norm': 2.851909875869751, 'learning_rate': 2.8093956089823858e-05, 'epoch': 1.31} +{'loss': 0.1382, 'grad_norm': 0.22848767042160034, 'learning_rate': 2.7572209700308875e-05, 'epoch': 1.35} +{'loss': 0.1396, 'grad_norm': 3.973886489868164, 'learning_rate': 2.7050463310793888e-05, 'epoch': 1.38} +{'loss': 0.127, 'grad_norm': 3.140080451965332, 'learning_rate': 2.6529760414057936e-05, 'epoch': 1.41} +{'loss': 0.1276, 'grad_norm': 5.468123435974121, 'learning_rate': 2.6008014024542953e-05, 'epoch': 1.44} +{'loss': 0.1219, 'grad_norm': 0.626640260219574, 'learning_rate': 2.5486267635027966e-05, 'epoch': 1.47} +{'loss': 0.1319, 'grad_norm': 3.1899547576904297, 'learning_rate': 2.496452124551298e-05, 'epoch': 1.5} +{'loss': 0.1298, 'grad_norm': 3.199150562286377, 'learning_rate': 2.4442774855997996e-05, 'epoch': 1.53} +{'loss': 0.1217, 'grad_norm': 5.129565715789795, 'learning_rate': 2.3921028466483013e-05, 'epoch': 1.57} +{'loss': 0.1288, 'grad_norm': 4.223311424255371, 'learning_rate': 2.339928207696803e-05, 'epoch': 1.6} +{'loss': 0.1263, 'grad_norm': 10.741965293884277, 'learning_rate': 2.2877535687453046e-05, 'epoch': 1.63} +{'loss': 0.122, 'grad_norm': 3.0217132568359375, 'learning_rate': 2.235578929793806e-05, 'epoch': 1.66} +{'loss': 0.122, 'grad_norm': 7.847172737121582, 'learning_rate': 2.1835086401202104e-05, 'epoch': 1.69} +{'loss': 0.1266, 'grad_norm': 9.223713874816895, 
'learning_rate': 2.1313340011687117e-05, 'epoch': 1.72} +{'loss': 0.1274, 'grad_norm': 2.0706963539123535, 'learning_rate': 2.0791593622172137e-05, 'epoch': 1.75} +{'loss': 0.1214, 'grad_norm': 3.1475393772125244, 'learning_rate': 2.0270890725436182e-05, 'epoch': 1.78} +{'loss': 0.1191, 'grad_norm': 3.7348415851593018, 'learning_rate': 1.9749144335921196e-05, 'epoch': 1.82} +{'loss': 0.1199, 'grad_norm': 3.230713129043579, 'learning_rate': 1.9227397946406212e-05, 'epoch': 1.85} +{'loss': 0.1176, 'grad_norm': 0.4691683351993561, 'learning_rate': 1.8705651556891226e-05, 'epoch': 1.88} +{'loss': 0.1176, 'grad_norm': 4.382262706756592, 'learning_rate': 1.8183905167376242e-05, 'epoch': 1.91} +{'loss': 0.1083, 'grad_norm': 9.810182571411133, 'learning_rate': 1.7662158777861255e-05, 'epoch': 1.94} +{'loss': 0.1103, 'grad_norm': 8.107538223266602, 'learning_rate': 1.7140412388346275e-05, 'epoch': 1.97} +{'eval_loss': 0.1829579919576645, 'eval_accuracy': 0.9478369642628878, 'eval_runtime': 179.9731, 'eval_samples_per_second': 608.547, 'eval_steps_per_second': 76.072, 'epoch': 2.0} +{'loss': 0.1087, 'grad_norm': 0.5452843308448792, 'learning_rate': 1.661866599883129e-05, 'epoch': 2.0} +{'loss': 0.0456, 'grad_norm': 1.0569943189620972, 'learning_rate': 1.6097963102095334e-05, 'epoch': 2.03} +{'loss': 0.0523, 'grad_norm': 0.22022764384746552, 'learning_rate': 1.557621671258035e-05, 'epoch': 2.07} +{'loss': 0.0492, 'grad_norm': 9.75222396850586, 'learning_rate': 1.5054470323065365e-05, 'epoch': 2.1} +{'loss': 0.0498, 'grad_norm': 3.1281306743621826, 'learning_rate': 1.453272393355038e-05, 'epoch': 2.13} +{'loss': 0.0506, 'grad_norm': 0.012396792881190777, 'learning_rate': 1.4012021036814427e-05, 'epoch': 2.16} +{'loss': 0.0569, 'grad_norm': 6.527154922485352, 'learning_rate': 1.3490274647299442e-05, 'epoch': 2.19} +{'loss': 0.0548, 'grad_norm': 3.5429670810699463, 'learning_rate': 1.2968528257784457e-05, 'epoch': 2.22} +{'loss': 0.0558, 'grad_norm': 1.333369255065918, 
'learning_rate': 1.2446781868269472e-05, 'epoch': 2.25} +{'loss': 0.0464, 'grad_norm': 0.10260029882192612, 'learning_rate': 1.1926078971533518e-05, 'epoch': 2.29} +{'loss': 0.0515, 'grad_norm': 0.14060164988040924, 'learning_rate': 1.1404332582018533e-05, 'epoch': 2.32} +{'loss': 0.0448, 'grad_norm': 1.031032919883728, 'learning_rate': 1.0882586192503548e-05, 'epoch': 2.35} +{'loss': 0.0475, 'grad_norm': 0.20121368765830994, 'learning_rate': 1.0360839802988565e-05, 'epoch': 2.38} +{'loss': 0.0522, 'grad_norm': 0.06531311571598053, 'learning_rate': 9.84013690625261e-06, 'epoch': 2.41} +{'loss': 0.0434, 'grad_norm': 0.04498385637998581, 'learning_rate': 9.318390516737625e-06, 'epoch': 2.44} +{'loss': 0.0468, 'grad_norm': 0.3482716679573059, 'learning_rate': 8.796644127222641e-06, 'epoch': 2.47} +{'loss': 0.0505, 'grad_norm': 4.0475053787231445, 'learning_rate': 8.274897737707656e-06, 'epoch': 2.5} +{'loss': 0.0421, 'grad_norm': 0.6960127353668213, 'learning_rate': 7.753151348192671e-06, 'epoch': 2.54} +{'loss': 0.0451, 'grad_norm': 0.8902493119239807, 'learning_rate': 7.231404958677686e-06, 'epoch': 2.57} +{'loss': 0.0522, 'grad_norm': 0.46462351083755493, 'learning_rate': 6.710702061941732e-06, 'epoch': 2.6} +{'loss': 0.0468, 'grad_norm': 0.07463126629590988, 'learning_rate': 6.1889556724267476e-06, 'epoch': 2.63} +{'loss': 0.0429, 'grad_norm': 0.05138092488050461, 'learning_rate': 5.6672092829117625e-06, 'epoch': 2.66} +{'loss': 0.038, 'grad_norm': 0.06017659977078438, 'learning_rate': 5.145462893396778e-06, 'epoch': 2.69} +{'loss': 0.0418, 'grad_norm': 3.794154405593872, 'learning_rate': 4.624759996660823e-06, 'epoch': 2.72} +{'loss': 0.0418, 'grad_norm': 9.929149627685547, 'learning_rate': 4.103013607145838e-06, 'epoch': 2.75} +{'loss': 0.0435, 'grad_norm': 0.10156802833080292, 'learning_rate': 3.5812672176308544e-06, 'epoch': 2.79} +{'loss': 0.039, 'grad_norm': 15.590471267700195, 'learning_rate': 3.0595208281158697e-06, 'epoch': 2.82} +{'loss': 0.0451, 
'grad_norm': 0.1026441678404808, 'learning_rate': 2.5377744386008846e-06, 'epoch': 2.85} +{'loss': 0.0408, 'grad_norm': 0.08782440423965454, 'learning_rate': 2.0160280490859004e-06, 'epoch': 2.88} +{'loss': 0.0372, 'grad_norm': 17.5203857421875, 'learning_rate': 1.494281659570916e-06, 'epoch': 2.91} +{'loss': 0.041, 'grad_norm': 0.08832889050245285, 'learning_rate': 9.735787628349612e-07, 'epoch': 2.94} +{'loss': 0.0417, 'grad_norm': 10.057083129882812, 'learning_rate': 4.518323733199766e-07, 'epoch': 2.97} +{'eval_loss': 0.2335142344236374, 'eval_accuracy': 0.9541735906941071, 'eval_runtime': 176.4196, 'eval_samples_per_second': 620.804, 'eval_steps_per_second': 77.605, 'epoch': 3.0} +{'train_runtime': 26437.5455, 'train_samples_per_second': 57.998, 'train_steps_per_second': 1.812, 'train_loss': 0.1687752874718155, 'epoch': 3.0} +100%|██████████| 13691/13691 [02:53<00:00, 78.95it/s] +{'eval_loss': 0.17655357718467712, 'eval_accuracy': 0.9493257124074396, 'eval_runtime': 173.4293, 'eval_samples_per_second': 631.514, 'eval_steps_per_second': 78.943, 'epoch': 3.0} diff --git a/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt b/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..847c45ecccb522de294762faeeb01fe5fb02f7ac --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/files/requirements.txt @@ -0,0 +1,541 @@ +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +pyg-lib==0.4.0+pt20cu117 +biopython==1.85 +iniconfig==2.0.0 +tokenizers==0.20.0 +accelerate==1.3.0 +torch==2.6.0 +nvidia-nccl-cu12==2.21.5 +transformers==4.45.2 +nvidia-cusparse-cu12==12.3.1.170 +torch-scatter==2.1.2+pt20cu117 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nvtx-cu12==12.4.127 +zstd==1.5.6.6 +fair-esm==2.0.0 +omegaconf==2.3.0 +pluggy==1.5.0 +pytest==8.3.5 +nvidia-curand-cu12==10.3.5.147 +nvidia-cufft-cu12==11.2.1.3 +torch-cluster==1.6.3+pt20cu117 +regex==2024.9.11 +nvidia-cudnn-cu12==9.1.0.70 
+torch-spline-conv==1.2.2+pt20cu117 +nvidia-cusolver-cu12==11.6.1.9 +antlr4-python3-runtime==4.9.3 +msgpack-numpy==0.4.8 +nlp==0.2.0 +einops==0.8.1 +nvidia-cublas-cu12==12.4.5.8 +triton==3.2.0 +ninja==1.11.1.3 +hydra-core==1.3.2 +nvidia-nvjitlink-cu12==12.4.127 +biotite==0.41.2 +torch-sparse==0.6.18+pt20cu117 +esm==3.1.4 +sympy==1.13.1 +nvidia-cuda-runtime-cu12==12.4.127 +jupyter-lsp==2.2.5 +jupyter-events==0.10.0 +ipykernel==6.29.5 +Mako==1.3.5 +proto-plus==1.25.0 +fst-pso==1.8.1 +gensim==4.3.3 +htmlmin==0.1.12 +tokenizers==0.13.3 +timm==1.0.11 +MarkupSafe==3.0.2 +safetensors==0.4.5 +requests==2.32.3 +gast==0.5.5 +cuml==24.12.0a33 +jaxlib==0.4.23.dev20240214 +spacy-loggers==1.0.5 +pytz==2024.1 +idna==3.10 +python-dateutil==2.9.0 +mdurl==0.1.2 +blis==0.7.10 +jupyter==1.1.1 +pyerfa==2.0.1.5 +comm==0.2.2 +pygraphviz==1.14 +dill==0.3.8 +paramiko==3.5.0 +llama-index==0.8.36 +mdit-py-plugins==0.4.2 +Werkzeug==3.1.3 +pyu2f==0.1.5 +dask-glm==0.2.0 +httpx==0.27.2 +typeguard==4.4.1 +mypy-extensions==1.0.0 +kmodes==0.12.2 +keras==2.15.0 +ydata-profiling==0.0.dev0 +regex==2024.11.6 +xarray==2024.11.0 +setuptools==75.3.0 +charset-normalizer==3.4.0 +jupyterlab_nvdashboard==0.11.0 +pylibraft==24.12.0a36 +spacy==3.7.6 +mlflow-skinny==2.17.2 +nvtx==0.2.10 +multimethod==1.12 +pexpect==4.9.0 +torch==2.1.0.post301 +flatbuffers==24.3.25 +python-json-logger==2.0.7 +PyJWT==2.9.0 +multiprocess==0.70.16 +colorlover==0.3.0 +yarl==1.16.0 +locket==1.0.0 +patsy==1.0.0 +rapids-dask-dependency==24.12.0a0 +stanza==1.9.2 +debugpy==1.8.8 +jupyterlab_pygments==0.3.0 +pylibcudf==24.12.0a337 +lz4==4.3.3 +pandas==2.2.3 +tifffile==2024.9.20 +pynvml==11.4.1 +cufflinks==0.17.3 +ipywidgets==8.1.5 +requests-oauthlib==2.0.0 +google-auth-oauthlib==1.2.1 +rsa==4.9 +webcolors==24.8.0 +jsonschema-specifications==2024.10.1 +scikit-learn==1.5.2 +langchain-text-splitters==0.3.2 +pandas-datareader==0.10.0 +tomli==2.0.2 +tzdata==2024.2 +scikit-image==0.24.0 +tensorboard_data_server==0.7.0 +kiwisolver==1.4.7 
+cloudpathlib==0.20.0 +isodate==0.6.1 +adversarial-robustness-toolbox==1.19.1 +SQLAlchemy==2.0.36 +pytest-runner==6.0.0 +pycairo==1.27.0 +treelite==4.3.0 +jiter==0.7.0 +threadpoolctl==3.5.0 +pandocfilters==1.5.0 +loguru==0.7.2 +smart_open==7.0.5 +shellingham==1.5.4 +deepspeed==0.15.4 +prompt_toolkit==3.0.48 +databricks-sdk==0.34.0 +langchain-core==0.3.15 +imageio==2.36.0 +openapi-schema-pydantic==1.2.4 +zict==3.0.0 +cachetools==5.5.0 +colorful==0.5.6 +mpmath==1.3.0 +nest_asyncio==1.6.0 +pyFUME==0.2.25 +opencv-python-headless==4.9.0 +fastai==2.7.18 +importlib_resources==6.4.5 +binaryornot==0.4.4 +evaluate==0.4.1 +matplotlib-inline==0.1.7 +wasabi==1.1.2 +pycparser==2.22 +GitPython==3.1.43 +pluggy==1.5.0 +async-lru==2.0.4 +pgmpy==0.1.24 +anyio==4.4.0 +executing==2.1.0 +orjson==3.10.11 +humanfriendly==10.0 +tornado==6.4.1 +gmpy2==2.1.5 +rlPyCairo==0.2.0 +distributed==2024.11.0 +FuzzyTM==2.0.5 +torchtext==0.15.2a0+5ce3163 +pytest==8.3.5 +pyod==2.0.2 +ImageHash==4.3.1 +soupsieve==2.5 +tblib==3.0.0 +emoji==2.14.0 +aiohappyeyeballs==2.4.3 +uri-template==1.3.0 +tensorflow_estimator==2.15.0 +babel==2.16.0 +dask-cuda==24.12.0a12 +overrides==7.7.0 +opencensus==0.11.3 +openai==0.28.1 +language_data==1.2.0 +jedi==0.19.2 +cookiecutter==2.6.0 +entrypoints==0.4 +exceptiongroup==1.2.2 +marisa-trie==1.2.0 +uvloop==0.20.0 +aiosignal==1.3.1 +Flask==3.0.3 +tensorboard==2.15.2 +cffi==1.17.1 +tf_keras==2.15.0 +absl-py==2.1.0 +blinker==1.9.0 +types-python-dateutil==2.9.0.20241003 +opencv-python==4.9.0 +frozendict==2.4.6 +aiohttp-cors==0.7.0 +statsmodels==0.14.4 +tinycss2==1.4.0 +terminado==0.18.1 +pycaret==2.2.3 +aiohttp==3.10.10 +distributed-ucxx==0.41.0 +prometheus_client==0.21.0 +fastdownload==0.0.7 +grpcio==1.59.3 +google-api-core==2.22.0 +jupyterlab_widgets==3.0.13 +appdirs==1.4.4 +littleutils==0.0.0 +ray==2.24.0 +kaggle==1.6.17 +jsonschema==4.23.0 +google-auth==2.36.0 +scikit-base==0.11.0 +visions==0.7.6 +pyarrow==15.0.0 +transformers==4.33.0 +prometheus_flask_exporter==0.23.1 
+dm-tree==0.1.8 +colorama==0.4.6 +requests-toolbelt==1.0.0 +cached-property==1.5.2 +cymem==2.0.8 +PyNaCl==1.5.0 +PyWavelets==1.7.0 +httptools==0.6.1 +typing-utils==0.1.0 +email_validator==2.2.0 +marshmallow==3.23.1 +Deprecated==1.2.14 +virtualenv==20.4.7 +optuna==3.6.1 +jupyter_server==2.14.2 +termcolor==2.5.0 +mpi4py==4.0.1 +torchdata==0.7.1+8cea82f +dataclasses==0.8 +cloudpickle==3.1.0 +tree_sitter_languages==1.10.2 +tabulate==0.9.0 +ipython==8.29.0 +lightgbm==4.3.0 +captum==0.6.0 +confuse==2.0.1 +torchvision==0.16.1+adc3221 +lxml==4.9.4 +fastapi==0.115.4 +python-multipart==0.0.17 +dnspython==2.7.0 +jupyter-console==6.6.3 +preshed==3.0.9 +py-cpuinfo==9.0.0 +Send2Trash==1.8.3 +murmurhash==1.0.10 +sniffio==1.3.1 +websockets==13.1 +h11==0.14.0 +smmap==5.0.0 +textual==0.85.2 +jsonpatch==1.33 +opencensus-context==0.1.3 +nbconvert==7.16.4 +sentry-sdk==2.19.0 +opentelemetry-semantic-conventions==0.37b0 +pandas-profiling==2.8.0 +pillow==10.3.0 +peft==0.13.2 +rpds-py==0.21.0 +bokeh==3.6.1 +distro==1.9.0 +itsdangerous==2.2.0 +wandb==0.18.7 +jsonpointer==3.0.0 +astropy-iers-data==0.2024.11.11.0.32.38 +horovod==0.28.1 +graphviz==0.20.3 +vtk==9.3.1 +bleach==6.2.0 +numexpr==2.8.7 +pydantic_core==2.23.4 +Jinja2==3.1.4 +widgetsnbextension==4.0.13 +filelock==3.16.1 +catboost==1.2.7 +raft-dask==24.12.0a36 +async-timeout==4.0.3 +datefinder==0.7.3 +coloredlogs==15.0.1 +platformdirs==4.3.6 +spacy-legacy==3.0.12 +chardet==5.2.0 +jupyter_client==8.6.3 +importlib_metadata==8.5.0 +rfc3986-validator==0.1.1 +huggingface_hub==0.26.2 +PySocks==1.7.1 +mlxtend==0.23.2 +outdated==0.2.2 +partd==1.4.2 +thinc==8.2.5 +astropy==6.1.6 +rdflib==6.3.2 +h2==4.1.0 +typer==0.13.0 +xyzservices==2024.9.0 +toolz==0.12.1 +frozenlist==1.5.0 +rdkit==2024.9.2 +pyasn1==0.6.1 +jupyter_server_terminals==0.5.3 +ucx-py==0.41.0a11 +astunparse==1.6.3 +simpful==2.12.0 +notebook_shim==0.2.4 +scipy==1.13.1 +colorlog==6.9.0 +tiktoken==0.3.3 +plotly==5.24.1 +fastrlock==0.8.2 +chart-studio==1.1.0 +stack-data==0.6.2 
+google-pasta==0.2.0 +sktime==0.34.0 +PyYAML==6.0.2 +sympy==1.13.3 +multidict==6.1.0 +ml-dtypes==0.2.0 +tensorboardX==2.6.2.2 +decorator==5.1.1 +cytoolz==1.0.0 +ase==3.23.0 +isoduration==20.11.0 +html5lib==1.1 +langsmith==0.1.142 +future==1.0.0 +onnx2torch==1.5.15 +multipledispatch==0.6.0 +protobuf==4.24.4 +ucxx==0.41.0 +pandas_flavor==0.6.0 +msgpack==1.1.0 +pyasn1_modules==0.4.1 +imagecodecs==2024.1.1 +mlflow==2.17.2 +watchfiles==0.24.0 +dm-sonnet==2.0.2 +langcodes==3.4.1 +freetype-py==2.3.0 +argon2-cffi-bindings==21.2.0 +trimesh==4.5.2 +opt_einsum==3.4.0 +tenacity==8.5.0 +h5py==3.12.1 +fastapi-cli==0.0.5 +oauthlib==3.2.2 +parso==0.8.4 +weasel==0.4.1 +yfinance==0.2.49 +networkx==2.8.8 +bitsandbytes==0.44.1 +lazy_loader==0.4 +querystring_parser==1.2.4 +contourpy==1.3.0 +unicodedata2==15.1.0 +bcrypt==4.2.0 +munkres==1.1.4 +langchain==0.0.298 +hpack==4.0.0 +cryptography==43.0.3 +umap-learn==0.5.7 +arrow==1.3.0 +docker==7.1.0 +certifi==2025.1.31 +fastjsonschema==2.20.0 +tensorflow==2.15.0 +googleapis-common-protos==1.65.0 +iniconfig==2.0.0 +Markdown==3.6 +llvmlite==0.43.0 +wslink==2.3.2 +attrs==24.2.0 +rich==13.9.4 +cupy==13.3.0 +uc-micro-py==1.0.3 +alembic==1.14.0 +joblib==1.4.2 +reportlab==4.2.5 +miniful==0.0.6 +jupyter_core==5.7.2 +wheel==0.45.0 +phik==0.12.3 +mistune==3.0.2 +wcwidth==0.2.13 +dacite==1.8.1 +accelerate==0.22.0 +sacremoses==0.0.53 +revtok==0.0.3 +python-slugify==8.0.4 +tangled-up-in-unicode==0.2.0 +dask==2024.11.0 +markdown-it-py==3.0.0 +sentencepiece==0.1.99 +beautifulsoup4==4.12.3 +six==1.16.0 +numba-cuda==0.0.17 +argon2-cffi==23.1.0 +xxhash==3.5.0 +hjson==3.1.0 +fonttools==4.54.1 +graphql-core==3.2.5 +pyparsing==3.2.0 +pure_eval==0.2.3 +distlib==0.3.9 +lightning==2.4.0 +wordcloud==0.0.0 +catalogue==2.0.10 +jax==0.4.27 +tree-sitter==0.23.2 +notebook==7.2.2 +dataclasses-json==0.6.7 +propcache==0.2.0 +numba==0.60.0 +dask-expr==1.1.17 +pydantic==2.9.2 +gunicorn==22.0.0 +missingno==0.5.2 +pyOpenSSL==24.2.1 +openpyxl==3.1.5 +packaging==24.1 
+python-dotenv==1.0.1 +cycler==0.12.1 +types-pytz==2024.2.0.20241003 +yellowbrick==1.5 +referencing==0.35.1 +pyLDAvis==3.4.1 +lazypredict==0.2.16 +fqdn==1.5.1 +websocket-client==1.8.0 +fastcore==1.7.19 +pynvjitlink-cu12==0.3.0 +pingouin==0.5.5 +numpy==1.26.4 +typing-inspect==0.9.0 +nltk==3.9.1 +onnxruntime==1.19.2 +tensorflow-probability==0.23.0 +datasets==3.0.2 +pickleshare==0.7.5 +peewee==3.17.7 +torch-geometric==2.6.1 +ptyprocess==0.7.0 +greenlet==3.1.1 +graphql-relay==3.2.0 +graphene==3.4.3 +et_xmlfile==2.0.0 +webencodings==0.5.1 +hyperframe==6.0.1 +multitasking==0.0.9 +typer-slim==0.13.0 +onnx==1.15.0 +uvicorn==0.32.0 +memray==1.13.4 +xgboost==2.1.2 +Brotli==1.1.0 +zipp==3.21.0 +nbformat==5.10.4 +responses==0.18.0 +funcy==2.0 +Pygments==2.18.0 +tqdm==4.67.0 +linkify-it-py==2.0.3 +srsly==2.4.8 +cuda-python==12.6.0 +lightning-utilities==0.11.8 +cudf==24.12.0a337 +dask-ml==2024.4.4 +docker-pycreds==0.4.0 +pkgutil_resolve_name==1.3.10 +opentelemetry-api==1.16.0 +fsspec==2024.9.0 +nbclient==0.10.0 +psutil==5.9.8 +pytorch-lightning==2.4.0 +sortedcontainers==2.4.0 +matplotlib==3.9.2 +defusedxml==0.7.1 +urllib3==1.26.19 +jupyterlab_server==2.27.3 +retrying==1.3.3 +dask-cudf==24.12.0a337 +sqlparse==0.5.1 +text-unidecode==1.3 +seaborn==0.13.2 +typing_extensions==4.12.2 +pyzmq==26.2.0 +rfc3339-validator==0.1.4 +pynndescent==0.5.13 +pip==24.3.1 +confection==0.1.4 +wrapt==1.14.1 +fastprogress==1.0.3 +traitlets==5.14.3 +asttokens==2.4.1 +json5==0.9.28 +pandas-stubs==2.2.3.241126 +torchmetrics==1.2.1 +gitdb==4.0.11 +annotated-types==0.7.0 +ipython-autotime==0.1 +httpcore==1.0.6 +click==8.1.7 +setproctitle==1.3.3 +starlette==0.41.2 +jupyterlab==4.2.5 +rmm==24.12.0a27 +opentelemetry-sdk==1.16.0 +textblob==0.15.3 +imbalanced-learn==0.12.4 +typeguard==4.3.0 +more-itertools==10.3.0 +zipp==3.19.2 +autocommand==2.2.2 +jaraco.context==5.3.0 +packaging==24.1 +importlib_metadata==8.0.0 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +importlib_resources==6.4.0 +tomli==2.0.1 
+jaraco.text==3.12.1 +wheel==0.43.0 +jaraco.collections==5.1.0 +typing_extensions==4.12.2 +inflect==7.3.1 +backports.tarfile==1.2.0 diff --git a/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json b/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c892eb99e110ed3a340c5c88fca4d4e7601345 --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/files/wandb-metadata.json @@ -0,0 +1,77 @@ +{ + "os": "Linux-5.14.0-427.13.1.el9_4.x86_64-x86_64-with-glibc2.34", + "python": "3.10.15", + "startedAt": "2025-05-04T14:25:03.372176Z", + "program": "/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py", + "codePath": "finetuning_bc_prott5.py", + "email": "zeynep.isik1@sabanciuniv.edu", + "root": "/arf/scratch/zisik/prott5_bc_ft", + "host": "kolyoz1", + "username": "zisik", + "executable": "/arf/sw/apps/truba-ai/gpu/miniforge3-2024/envs/gpu-2024.0/bin/python3", + "codePathLocal": "finetuning_bc_prott5.py", + "cpu_count": 64, + "cpu_count_logical": 64, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 1, + "disk": { + "/": { + "total": "7643995308032", + "used": "274939207680" + } + }, + "memory": { + "total": "1081373220864" + }, + "cpu": { + "count": 64, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "cluster_name": "cuda", + "conf": "/etc/slurm/slurm.conf", + "cpus_on_node": "16", + "cpus_per_task": "16", + "gpus_on_node": "1", + "gtids": "0", + "job_account": "tbag154", + "job_cpus_per_node": "16", + "job_end_time": "1746627878", + "job_gid": "11636", + "job_gpus": "3", + "job_id": "1027971", + "job_name": "msa_ph_pt", + "job_nodelist": "kolyoz1", + "job_num_nodes": "1", + "job_partition": "kolyoz-cuda", + "job_qos": "tbag", + "job_start_time": "1746368678", + "job_uid": "11636", + "job_user": "zisik", + "jobid": "1027971", + "localid": "0", + 
"mem_per_cpu": "14000", + "nnodes": "1", + "node_aliases": "(null)", + "nodeid": "0", + "nodelist": "kolyoz1", + "prio_process": "0", + "procid": "0", + "submit_dir": "/arf/scratch/zisik", + "submit_host": "cuda-ui", + "task_pid": "3189684", + "tasks_per_node": "1", + "topology_addr": "kolyoz1", + "topology_addr_pattern": "node", + "working_cluster": "cuda:slurmcontroller3.ib:6800:9984:109" + }, + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json b/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1d540fd3ad4df49061ca8c1d3c6754d4ef243041 --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/epoch":3,"_step":99,"train/grad_norm":10.057083129882812,"train/learning_rate":4.518323733199766e-07,"eval/steps_per_second":78.943,"train/loss":0.0417,"train_steps_per_second":1.812,"_wandb":{"runtime":26669},"total_flos":0,"eval/loss":0.17655357718467712,"train_loss":0.1687752874718155,"train_runtime":26437.5455,"eval/accuracy":0.9493257124074396,"_timestamp":1.7463953681937246e+09,"train_samples_per_second":57.998,"eval/runtime":173.4293,"train/global_step":47916,"eval/samples_per_second":631.514,"_runtime":26664.822052477} \ No newline at end of file diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..be6c40900e077df04434c2058af02e8279bfd761 --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-05-04T17:25:02.471184127+03:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpiwo6benn/port-3189710.txt","pid":3189710,"debug":false,"disable-analytics":false} 
+{"time":"2025-05-04T17:25:02.471231751+03:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-05-04T17:25:02.472141348+03:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":33653,"Zone":""}} +{"time":"2025-05-04T17:25:02.472268534+03:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3189710} +{"time":"2025-05-04T17:25:02.658908169+03:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:54060"} +{"time":"2025-05-04T17:25:03.374276784+03:00","level":"INFO","msg":"handleInformInit: received","streamId":"0ictlmwf","id":"127.0.0.1:54060"} +{"time":"2025-05-04T17:25:03.501310319+03:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"0ictlmwf","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:32.577676717+03:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:32.577829204+03:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-05-05T00:49:32.577794736+03:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:32.578007826+03:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:33.742564589+03:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:33.742592681+03:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:54060"} +{"time":"2025-05-05T00:49:33.742613896+03:00","level":"INFO","msg":"server is closed"} diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8f66fe3c7ae770c0e93c28ce15a95a46c40e21af --- /dev/null +++ 
b/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log @@ -0,0 +1,21 @@ +{"time":"2025-05-04T17:25:03.375857654+03:00","level":"INFO","msg":"using version","core version":"0.18.7"} +{"time":"2025-05-04T17:25:03.375905253+03:00","level":"INFO","msg":"created symlink","path":"/arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-core.log"} +{"time":"2025-05-04T17:25:03.501241143+03:00","level":"INFO","msg":"created new stream","id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501294637+03:00","level":"INFO","msg":"stream: started","id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501448652+03:00","level":"INFO","msg":"writer: Do: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501451145+03:00","level":"INFO","msg":"handler: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.501574427+03:00","level":"INFO","msg":"sender: started","stream_id":"0ictlmwf"} +{"time":"2025-05-04T17:25:03.865922055+03:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-05-04T22:47:43.191425732+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2025-05-05T00:01:47.351449692+03:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/isikz/finetuning-bc-protT5/0ictlmwf/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"} +{"time":"2025-05-05T00:49:32.57779148+03:00","level":"INFO","msg":"stream: closing","id":"0ictlmwf"} +{"time":"2025-05-05T00:49:32.577842715+03:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-05-05T00:49:32.578849729+03:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-05-05T00:49:32.781968337+03:00","level":"WARN","msg":"No job ingredients found, not creating job artifact"} +{"time":"2025-05-05T00:49:32.781997123+03:00","level":"WARN","msg":"No source type 
found, not creating job artifact"} +{"time":"2025-05-05T00:49:32.782008311+03:00","level":"INFO","msg":"sender: sendDefer: no job artifact to save"} +{"time":"2025-05-05T00:49:33.357099059+03:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-05-05T00:49:33.741524339+03:00","level":"INFO","msg":"handler: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741583153+03:00","level":"INFO","msg":"writer: Close: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741593811+03:00","level":"INFO","msg":"sender: closed","stream_id":"0ictlmwf"} +{"time":"2025-05-05T00:49:33.741652369+03:00","level":"INFO","msg":"stream: closed","id":"0ictlmwf"} diff --git a/wandb/run-20250504_172503-0ictlmwf/logs/debug.log b/wandb/run-20250504_172503-0ictlmwf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..627abd37727afa0dddc772a5f08d1d451156833a --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/logs/debug.log @@ -0,0 +1,27 @@ +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Current SDK version is 0.18.7 +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Configure stats pid to 3189710 +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/home/zisik/.config/wandb/settings +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from /arf/scratch/zisik/prott5_bc_ft/wandb/settings +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-05-04 17:25:03,365 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'finetuning_bc_prott5.py', 'program_abspath': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py', 'program': '/arf/scratch/zisik/prott5_bc_ft/finetuning_bc_prott5.py'} +2025-05-04 17:25:03,366 INFO 
MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():533] Logging user logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug.log +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:_log_setup():534] Logging internal logs to /arf/scratch/zisik/prott5_bc_ft/wandb/run-20250504_172503-0ictlmwf/logs/debug-internal.log +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():619] calling init triggers +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {} +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():669] starting backend +2025-05-04 17:25:03,366 INFO MainThread:3189710 [wandb_init.py:init():673] sending inform_init request +2025-05-04 17:25:03,371 INFO MainThread:3189710 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-04 17:25:03,371 INFO MainThread:3189710 [wandb_init.py:init():686] backend started and connected +2025-05-04 17:25:03,379 INFO MainThread:3189710 [wandb_init.py:init():781] updated telemetry +2025-05-04 17:25:03,382 INFO MainThread:3189710 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-05-04 17:25:03,852 INFO MainThread:3189710 [wandb_init.py:init():867] starting run threads in backend +2025-05-04 17:25:05,277 INFO MainThread:3189710 [wandb_run.py:_console_start():2456] atexit reg +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2305] redirect: wrap_raw +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-05-04 17:25:05,278 INFO MainThread:3189710 [wandb_run.py:_redirect():2395] Redirects installed. 
+2025-05-04 17:25:05,283 INFO MainThread:3189710 [wandb_init.py:init():911] run started, returning control to user process +2025-05-04 17:25:53,069 INFO MainThread:3189710 [wandb_run.py:_config_callback():1387] config_cb None None {'output_dir': 't5-bc-out', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 't5-bc-out/runs/May04_17-25-43_kolyoz1', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 't5-bc-out', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 
'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'epoch', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2025-05-05 00:49:32,578 WARNING MsgRouterThr:3189710 [router.py:message_loop():75] message_loop 
has been closed diff --git a/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb b/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f8cc6f4ca642c6b7dd3b09e91225198cbf4cc952 --- /dev/null +++ b/wandb/run-20250504_172503-0ictlmwf/run-0ictlmwf.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:006a1cac0ce47f9249802031630141a9f36a796d14d7c386b77939289e4b498e +size 17270813