isikz committed on
Commit
53149f4
·
verified ·
1 Parent(s): 538c420

Delete finetuning_bc_prott5.py

Browse files
Files changed (1) hide show
  1. finetuning_bc_prott5.py +0 -149
finetuning_bc_prott5.py DELETED
@@ -1,149 +0,0 @@
1
- import torch, torch.nn as nn
2
- from transformers import (T5EncoderModel, T5Tokenizer,
3
- Trainer, TrainingArguments)
4
- from transformers.modeling_outputs import SequenceClassifierOutput
5
- from datasets import load_dataset
6
- from sklearn.metrics import accuracy_score
7
- import pandas as pd
8
- import wandb
9
- from huggingface_hub import login
10
- import re
11
- from datasets import Dataset
12
-
13
# ---------------------------
# 1. LOGIN / EXPERIMENT TRACKING
# ---------------------------

# Authenticate with Weights & Biases and open a run for this fine-tuning job.
wandb.login()
wandb.init(project='finetuning-bc-protT5')
19
-
20
# ---------------------------
# 2. DATA PREPARATION
# ---------------------------
data = pd.read_csv("ready_to_train.csv")

# Positive sites and negative (non-phospho) sites, both restricted to
# exactly 15-residue windows.
pos = data.loc[data["SITE_+/-7_AA"].str.len() == 15, "SITE_+/-7_AA"].tolist()
neg = data.loc[data["NON_PH_SITE"].str.len() == 15, "NON_PH_SITE"].tolist()

texts = pos + neg
labels = [1] * len(pos) + [0] * len(neg)

# ProtT5 expects space-separated uppercase residues; rare amino acids
# U/Z/O/B are mapped to X and gap characters '_' become '-'.
prep_texts = [
    re.sub(r"[UZOB]", "X", " ".join(t.upper())).replace("_", "-")
    for t in texts
]

from sklearn.model_selection import train_test_split

# 70/15/15 train/val/test split with a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(prep_texts, labels, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)
35
-
36
# SentencePiece tokenizer matching the ProtT5 encoder below.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")


def tokenize(batch):
    """Tokenize a batch of space-separated sequences, padded/truncated to 64 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
43
-
44
def to_hf_dataset(texts, labels):
    """Pack parallel text/label lists into a plain dict ready for Dataset.from_dict.

    NOTE(review): currently unused — the datasets below call Dataset.from_dict
    directly with an equivalent literal dict.
    """
    return dict(text=texts, label=labels)
46
-
47
# Build the train/validation splits as tokenized, torch-formatted datasets.
train_ds = (
    Dataset.from_dict({"text": X_train, "label": y_train})
    .map(tokenize, batched=True)
    .with_format("torch")
)
val_ds = (
    Dataset.from_dict({"text": X_val, "label": y_val})
    .map(tokenize, batched=True)
    .with_format("torch")
)
52
-
53
-
54
-
55
# ---------------------------
# 3. MODEL: T5 encoder + classification head
# ---------------------------
class T5BinaryClassifier(nn.Module):
    """Binary classifier on top of a ProtT5 encoder.

    The encoder's last hidden states are mean-pooled over the sequence axis
    and passed through dropout + a linear head producing 2-way logits.

    Parameters
    ----------
    model_name : str
        Hugging Face hub id of the T5 encoder checkpoint.
    dropout : float, default 0.1
        Dropout applied to the pooled representation before the head.
    """

    def __init__(self, model_name, dropout=0.1):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        enc_dim = self.encoder.config.d_model  # 1024 for prot_t5_xl
        self.dropout = nn.Dropout(dropout)
        self.cls = nn.Linear(enc_dim, 2)  # binary head (2 logits)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                labels=None,
                **kwargs):
        """Return SequenceClassifierOutput with logits and (if labels given) CE loss."""
        enc_out = self.encoder(input_ids=input_ids,
                               attention_mask=attention_mask,
                               return_dict=True)
        hidden = enc_out.last_hidden_state  # (B, L, D)

        # FIX: mask-aware mean pooling. The original averaged over ALL
        # positions, including <pad> embeddings — sequences here are ~31
        # tokens padded to max_length=64, so padding dominated the mean
        # and diluted the pooled representation.
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)  # (B, L, 1)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        else:
            pooled = hidden.mean(dim=1)  # (B, D) — fallback, original behavior

        logits = self.cls(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=enc_out.hidden_states,
            attentions=enc_out.attentions,
        )
91
-
92
# FIX: the unconditional .cuda() crashed on CPU-only hosts; move to GPU only
# when one is available (the HF Trainer also handles device placement itself).
model = T5BinaryClassifier("Rostlab/prot_t5_xl_uniref50")
if torch.cuda.is_available():
    model = model.cuda()
93
-
94
# ---------------------------
# 4. TRAINING ARGUMENTS
# ---------------------------
args = TrainingArguments(
    output_dir="t5-bc-out",
    num_train_epochs=3,
    learning_rate=5e-5,
    # prot_t5_xl is large; 8-16 per device is advisable rather than 512.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # effective batch size: 32
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_safetensors=False,
    report_to=["wandb"],
    fp16=True,
)
111
-
112
def compute_metrics(eval_pred):
    """Compute accuracy for the HF Trainer.

    Parameters
    ----------
    eval_pred : EvalPrediction
        Pair of (predictions, label_ids) as numpy arrays.

    Returns
    -------
    dict
        {"accuracy": float} as expected by Trainer logging.
    """
    logits, labels = eval_pred
    # FIX: Trainer may hand predictions over as a tuple when the model
    # returns extra tensors (hidden_states/attentions); keep only logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}
117
-
118
# Assemble the Trainer and launch fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)
trainer.train()
127
-
128
# ---------------------------
# 5. TEST & SAVE
# ---------------------------

# Build the held-out test split as a Hugging Face Dataset, tokenize it,
# and put it in torch format.
test_ds = Dataset.from_dict({"text": X_test, "label": y_test})
test_ds = test_ds.map(tokenize, batched=True).with_format("torch")

metrics = trainer.evaluate(test_ds)
print(metrics)

# ---- Manual save ----
trainer.save_model("/arf/scratch/zisik/prott5_bc_ft")
tokenizer.save_pretrained("/arf/scratch/zisik/prott5_bc_ft")


#model.push_to_hub("isikz/prot_t5_binary_classifier")
#tokenizer.push_to_hub("isikz/prot_t5_binary_classifier")
#wandb.finish()
149
- #wandb.finish()