Delete finetuning_bc_prott5.py
finetuning_bc_prott5.py  +0 -149
finetuning_bc_prott5.py
DELETED
@@ -1,149 +0,0 @@
import torch, torch.nn as nn
from transformers import (T5EncoderModel, T5Tokenizer,
                          Trainer, TrainingArguments)
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset
from sklearn.metrics import accuracy_score
import pandas as pd
import wandb
import re

# ---------------------------
# 1. I/O AND LOGIN
# ---------------------------

wandb.login()
wandb.init(project='finetuning-bc-protT5')

# ---------------------------
# 2. DATA PREPARATION (your code)
# ---------------------------
data = pd.read_csv("ready_to_train.csv")
pos = data.loc[data["SITE_+/-7_AA"].str.len() == 15]["SITE_+/-7_AA"].tolist()
neg = data.loc[data["NON_PH_SITE"].str.len() == 15]["NON_PH_SITE"].tolist()
labels = [1]*len(pos) + [0]*len(neg)
texts = pos + neg
prep_texts = [" ".join(list(t.upper())) for t in texts]
prep_texts = [re.sub(r"[UZOB]", "X", pt).replace("_", "-") for pt in prep_texts]
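# (ProtT5 expects upper-case residues separated by single spaces, with the
# rare amino acids U/Z/O/B mapped to X. For example, a hypothetical 15-residue
# window "AAAsPT_SYVAAAAK" becomes "A A A S P T - S Y V A A A A K".)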

from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")

def tokenize(batch):
    return tokenizer(batch["text"],
                     padding="max_length",
                     truncation=True,
                     max_length=64)

def to_hf_dataset(texts, labels):   # unused helper
    return {"text": texts, "label": labels}

train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
val_ds = Dataset.from_dict({"text": X_val, "label": y_val})

train_ds = train_ds.map(tokenize, batched=True).with_format("torch")
val_ds = val_ds.map(tokenize, batched=True).with_format("torch")


# ---------------------------
# 3. MODEL: T5 + Classification Head
# ---------------------------
class T5BinaryClassifier(nn.Module):
    def __init__(self, model_name, dropout=0.1):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        enc_dim = self.encoder.config.d_model    # 1024 for prot_t5_xl
        self.dropout = nn.Dropout(dropout)
        self.cls = nn.Linear(enc_dim, 2)         # binary head

    def forward(self,
                input_ids=None,
                attention_mask=None,
                labels=None,
                **kwargs):
        enc_out = self.encoder(input_ids=input_ids,
                               attention_mask=attention_mask,
                               return_dict=True)
        # [CLS]-like vector: mean-pool the encoder states instead of using
        # the <pad> token position (id=0)
        hidden = enc_out.last_hidden_state   # (B, L, D)
        pooled = hidden.mean(dim=1)          # (B, D)
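        # Caveat: this plain mean also averages over <pad> embeddings. A
        # masked mean (sketch; assumes attention_mask is 1 on real tokens,
        # 0 on padding) would be:
        #   mask = attention_mask.unsqueeze(-1).float()
        #   pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)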

        logits = self.cls(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=enc_out.hidden_states,
            attentions=enc_out.attentions,
        )

model = T5BinaryClassifier("Rostlab/prot_t5_xl_uniref50").cuda()

# ---------------------------
# 4. TRAINING ARGUMENTS
# ---------------------------
args = TrainingArguments(
    output_dir="t5-bc-out",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,   # prot_t5_xl is large; 8-16 is realistic, not 512
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,   # effective batch size of 32
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_strategy="epoch",
    save_safetensors=False,
    report_to=["wandb"],
    fp16=True,
)
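# Note: newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; use whichever name matches the installed version.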

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

# ---------------------------
# 5. TEST & SAVE
# ---------------------------

# Python dict → Hugging Face Dataset
test_ds = Dataset.from_dict({"text": X_test, "label": y_test})

# Tokenize and convert to tensor format
test_ds = test_ds.map(tokenize, batched=True).with_format("torch")

metrics = trainer.evaluate(test_ds)
print(metrics)

# ---- Manual save ----
trainer.save_model("/arf/scratch/zisik/prott5_bc_ft")
tokenizer.save_pretrained("/arf/scratch/zisik/prott5_bc_ft")

# model.push_to_hub("isikz/prot_t5_binary_classifier")
# tokenizer.push_to_hub("isikz/prot_t5_binary_classifier")
# wandb.finish()
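
# Reload sketch (assumption: Trainer.save_model stores a plain nn.Module as a
# raw state_dict, written to pytorch_model.bin since save_safetensors=False,
# so the classifier has to be rebuilt before loading):
#   model = T5BinaryClassifier("Rostlab/prot_t5_xl_uniref50")
#   state = torch.load("/arf/scratch/zisik/prott5_bc_ft/pytorch_model.bin",
#                      map_location="cpu")
#   model.load_state_dict(state)
#   model.eval()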