github-actions[bot] committed on
Commit
8e13241
·
1 Parent(s): 7bea42e

Sync turing folder from GitHub

Browse files
turing/config.py CHANGED
@@ -75,6 +75,12 @@ MODEL_CONFIG = {
75
  "model_class_module": "turing.modeling.models.randomForestTfIdf",
76
  "model_class_name": "RandomForestTfIdf",
77
  },
 
 
 
 
 
 
78
  }
79
  DEFAULT_NUM_ITERATIONS = 20
80
 
@@ -82,6 +88,7 @@ DEFAULT_NUM_ITERATIONS = 20
82
  EXISTING_MODELS = [
83
  "randomForestTfIdf",
84
  "codeBerta",
 
85
  ]
86
 
87
  # If tqdm is installed, configure loguru with tqdm.write
 
75
  "model_class_module": "turing.modeling.models.randomForestTfIdf",
76
  "model_class_name": "RandomForestTfIdf",
77
  },
78
+ "deberta": {
79
+ "model_name": "DeBERTa-v3-xsmall-raw",
80
+ "exp_name": "fine-tuned-DeBERTa",
81
+ "model_class_module": "turing.modeling.models.DeBERTa",
82
+ "model_class_name": "DebertaXSmall",
83
+ },
84
  }
85
  DEFAULT_NUM_ITERATIONS = 20
86
 
 
88
  EXISTING_MODELS = [
89
  "randomForestTfIdf",
90
  "codeBerta",
91
+ "deBERTa",
92
  ]
93
 
94
  # If tqdm is installed, configure loguru with tqdm.write
turing/modeling/models/DeBERTa.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import shutil
4
+ import warnings
5
+
6
+ from loguru import logger
7
+ import mlflow
8
+ import numpy as np
9
+ from sklearn.metrics import (
10
+ accuracy_score,
11
+ classification_report,
12
+ f1_score,
13
+ precision_score,
14
+ recall_score,
15
+ )
16
+ import torch
17
+ from torch.utils.data import Dataset
18
+ from transformers import (
19
+ AutoModelForSequenceClassification,
20
+ AutoTokenizer,
21
+ EarlyStoppingCallback,
22
+ Trainer,
23
+ TrainingArguments,
24
+ )
25
+
26
+ from turing.config import MODELS_DIR
27
+
28
+ from ..baseModel import BaseModel
29
+
30
+ warnings.filterwarnings("ignore")
31
+
32
def compute_metrics(eval_pred):
    """
    Compute micro-averaged multi-label metrics from raw logits.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            raw logits (or a tuple whose first element holds them); ``labels``
            holds the reference label matrix.

    Returns:
        dict: ``"f1"``, ``"accuracy"``, ``"precision"`` and ``"recall"``.
        F1, precision and recall are micro-averaged.
    """
    predictions, labels = eval_pred

    # Some models hand back several outputs; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert logits to probabilities. Promote to float64 first so that
    # half-precision logits cannot overflow inside np.exp.
    logits = np.asarray(predictions, dtype=np.float64)
    with np.errstate(over="ignore"):
        probs = 1 / (1 + np.exp(-logits))

    # A label counts as predicted once its probability exceeds 0.35.
    preds = (probs > 0.35).astype(int)

    # zero_division=0 keeps the score at 0 when nothing is predicted (the
    # value sklearn falls back to anyway) without emitting a warning.
    return {
        "f1": f1_score(labels, preds, average="micro", zero_division=0),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="micro", zero_division=0),
        "recall": recall_score(labels, preds, average="micro", zero_division=0),
    }
50
+
51
class DebertaDataset(Dataset):
    """
    Internal Dataset class for DeBERTa.

    Wraps tokenizer encodings and, optionally, labels. Labels are stored as a
    float tensor. When ``num_labels`` is given and ``labels`` is 1-D (or a
    single column), every entry is treated as a class index and expanded into
    a one-hot row; an index outside ``[0, num_labels)`` yields an all-zero
    row. Any other label array is stored unchanged (cast to float).
    """

    def __init__(self, encodings, labels=None, num_labels=None):
        """
        Args:
            encodings: Mapping from field name (must include ``"input_ids"``)
                to a rectangular sequence convertible by ``torch.tensor``.
            labels: Optional labels (list, ``np.ndarray`` or ``torch.Tensor``).
            num_labels: Number of classes; enables the one-hot expansion of
                index-style labels described on the class.
        """
        self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
        self.labels = None
        if labels is None:
            return

        # Normalise to numpy so the indexing below never mixes tensor and
        # ndarray indices.
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        else:
            labels = np.asarray(labels)

        # Handle standard label list or flattened format
        is_index_vector = labels.ndim == 1 or (labels.ndim == 2 and labels.shape[1] == 1)
        if num_labels is not None and is_index_vector:
            labels = self._one_hot(labels.reshape(-1), num_labels)

        self.labels = torch.tensor(labels, dtype=torch.float)

    @staticmethod
    def _one_hot(indices, num_labels):
        """Expand a flat array of class indices into a float32 one-hot matrix."""
        as_float = indices.astype(np.float64)
        # Reject negatives explicitly: numpy would otherwise wrap them around
        # and silently mark one of the last classes.
        valid = np.isfinite(as_float) & (as_float >= 0) & (as_float < num_labels)
        one_hot = np.zeros((len(indices), num_labels), dtype=np.float32)
        one_hot[np.flatnonzero(valid), as_float[valid].astype(np.int64)] = 1.0
        return one_hot

    def __getitem__(self, idx):
        """Return the encoded fields (plus ``"labels"`` when present) at ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encodings["input_ids"])
82
+
83
class WeightedTrainer(Trainer):
    """
    Trainer whose loss is BCE-with-logits with a positive-class weight of 4.0
    applied to every label column.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on ``inputs`` and score its logits against the labels.

        Args:
            model: The model to call with ``**inputs``.
            inputs: Batch mapping; its ``"labels"`` entry supplies the targets.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        model_outputs = model(**inputs)
        scores = model_outputs.get("logits")

        # One weight per label column, all equal to 4.0.
        num_columns = scores.shape[1]
        positive_weights = torch.ones([num_columns]).to(scores.device) * 4.0
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=positive_weights)
        weighted_loss = criterion(scores, targets.float())

        if return_outputs:
            return (weighted_loss, model_outputs)
        return weighted_loss
94
+
95
class DebertaXSmall(BaseModel):
    """
    Wrapper for Microsoft DeBERTa-v3-xsmall.

    The model is configured for multi-label classification; predictions are
    obtained by applying a sigmoid to the logits and thresholding at
    ``PREDICTION_THRESHOLD``.
    """

    # Probability a label must exceed to be predicted as present.
    PREDICTION_THRESHOLD = 0.35

    def __init__(self, language, path=None):
        """
        Args:
            language: Dataset language. ``"java"`` and ``"python"`` select
                their own label counts and hyper-parameters; any other value
                uses the remaining (Pharo) settings.
            path: Forwarded unchanged to ``BaseModel.__init__``.
        """
        is_java = language == "java"

        self.params = {
            "model_name_hf": "microsoft/deberta-v3-xsmall",
            # Java: 7, Python: 5, Pharo: 6
            "num_labels": {"java": 7, "python": 5}.get(language, 6),
            "max_length": 128,
            "epochs": 10 if is_java else 20,
            "batch_size_train": 32,
            "batch_size_eval": 64,
            "learning_rate": 2e-5 if is_java else 3e-5,
            "weight_decay": 0.01,
            "train_size": 0.8,
            "early_stopping_patience": 3,
            "early_stopping_threshold": 0.005,
            "warmup_steps": 100,
        }

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        # params/device/tokenizer are set before delegating to the base class.
        super().__init__(language, path)

    def setup_model(self):
        """Load the pretrained tokenizer and classification model onto ``self.device``."""
        logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...")

        self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"], use_fast=False)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.params["model_name_hf"],
            num_labels=self.params["num_labels"],
            problem_type="multi_label_classification",
        ).to(self.device)
        logger.success("DeBERTa-v3-xsmall model initialized.")

    def _tokenize(self, texts):
        """
        Tokenize ``texts`` with truncation and padding up to ``max_length``.

        ``None`` and NaN entries become empty strings; everything else is
        passed through ``str``.

        Raises:
            ValueError: If the tokenizer has not been initialized.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized.")

        # Handle potential NaNs or non-strings (NaN is the only value != itself)
        safe_texts = [str(t) if t is not None and t == t else "" for t in texts]

        return self.tokenizer(
            safe_texts,
            truncation=True,
            padding=True,
            max_length=self.params["max_length"],
        )

    def train(self, X_train, y_train) -> dict:
        """
        Fine-tune the model on a random ``train_size`` split of the data,
        validating on the remainder with early stopping on F1.

        Args:
            X_train: Iterable of input texts.
            y_train: Labels accepted by ``DebertaDataset``.

        Returns:
            dict: The hyper-parameters used (``params`` without ``model_name_hf``).

        Raises:
            ValueError: If the model has not been initialized.
        """
        if self.model is None:
            raise ValueError("Model not initialized.")

        params_to_log = {k: v for k, v in self.params.items() if k != "model_name_hf"}
        logger.info(f"Starting training for: {self.language.upper()}")

        train_encodings = self._tokenize(X_train)
        full_dataset = DebertaDataset(train_encodings, y_train, num_labels=self.params["num_labels"])

        train_len = int(self.params["train_size"] * len(full_dataset))
        val_len = len(full_dataset) - train_len
        train_ds, val_ds = torch.utils.data.random_split(full_dataset, [train_len, val_len])

        temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_deberta_ckpt")

        training_args = TrainingArguments(
            output_dir=temp_ckpt_dir,
            num_train_epochs=self.params["epochs"],
            per_device_train_batch_size=self.params["batch_size_train"],
            per_device_eval_batch_size=self.params["batch_size_eval"],
            learning_rate=self.params["learning_rate"],
            weight_decay=self.params["weight_decay"],
            # Apply the declared warm-up so the logged hyper-parameters match
            # what the optimizer schedule actually does.
            warmup_steps=self.params["warmup_steps"],
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            logging_dir="./logs",
            report_to="none",
            fp16=torch.cuda.is_available(),
        )

        trainer = WeightedTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
            callbacks=[
                EarlyStoppingCallback(
                    early_stopping_patience=self.params["early_stopping_patience"],
                    early_stopping_threshold=self.params["early_stopping_threshold"],
                )
            ],
        )

        try:
            trainer.train()
        finally:
            # Remove intermediate checkpoints even when training fails.
            shutil.rmtree(temp_ckpt_dir, ignore_errors=True)

        return params_to_log

    def evaluate(self, X_test, y_test) -> dict:
        """
        Predict on ``X_test``, print a classification report and log metrics.

        Args:
            X_test: Iterable of input texts.
            y_test: Reference labels, either a label matrix or a 1-D /
                single-column array of class indices (expanded to one-hot;
                indices outside ``[0, num_labels)`` give an all-zero row).

        Returns:
            dict: ``"accuracy"``, ``"f1_score_micro"`` and
            ``"f1_score_weighted"``, also sent to ``mlflow.log_metrics``.
        """
        y_pred = self.predict(X_test)

        y_test_np = y_test if isinstance(y_test, np.ndarray) else np.array(y_test)

        # Handle 1D array conversion for metrics if necessary
        if y_test_np.ndim == 1 or (y_test_np.ndim == 2 and y_test_np.shape[1] == 1):
            num_labels = self.params["num_labels"]
            indices = y_test_np.reshape(-1)
            valid = (indices >= 0) & (indices < num_labels)
            expanded = np.zeros((len(indices), num_labels), dtype=int)
            expanded[np.flatnonzero(valid), indices[valid].astype(int)] = 1
            y_test_np = expanded

        report = classification_report(y_test_np, y_pred, zero_division=0)
        print(f"\n[DeBERTa {self.language}] Classification Report:\n{report}")

        metrics = {
            "accuracy": accuracy_score(y_test_np, y_pred),
            "f1_score_micro": f1_score(y_test_np, y_pred, average="micro"),
            "f1_score_weighted": f1_score(y_test_np, y_pred, average="weighted"),
        }

        mlflow.log_metrics(metrics)
        return metrics

    def predict(self, X) -> np.ndarray:
        """
        Return a 0/1 integer matrix with one row per text and one column per label.

        Args:
            X: Iterable of input texts.

        Raises:
            ValueError: If the model or tokenizer is not available.
        """
        import tempfile

        if self.model is None:
            raise ValueError("Model not trained.")

        texts = list(X)
        if not texts:
            # Nothing to tokenize; keep the (n_samples, n_labels) shape.
            return np.zeros((0, self.params["num_labels"]), dtype=int)

        self.model.eval()
        dataset = DebertaDataset(self._tokenize(texts), labels=None)

        # A private scratch directory avoids clashes between concurrent calls
        # and is removed even when prediction raises.
        with tempfile.TemporaryDirectory(prefix="pred_temp_deberta_") as scratch_dir:
            training_args = TrainingArguments(
                output_dir=scratch_dir,
                per_device_eval_batch_size=self.params["batch_size_eval"],
                fp16=torch.cuda.is_available(),
                report_to="none",
            )
            trainer = Trainer(model=self.model, args=training_args)
            output = trainer.predict(dataset)

        logits = output.predictions
        if isinstance(logits, tuple):
            logits = logits[0]

        # float64 keeps the sigmoid from overflowing on half-precision logits.
        logits = np.asarray(logits, dtype=np.float64)
        with np.errstate(over="ignore"):
            probs = 1 / (1 + np.exp(-logits))

        return (probs > self.PREDICTION_THRESHOLD).astype(int)

    def save(self, path, model_name):
        """
        Save the model, tokenizer and a small ``config_custom.json`` under
        ``path/<language>/<model_name>``, then try to log that directory to
        MLflow as artifacts.

        An existing directory at that location is replaced. A failure while
        logging to MLflow is reported through the logger and not raised.

        Raises:
            ValueError: If the model or tokenizer is not available.
        """
        # Validate before touching the disk so a failed call cannot wipe a
        # previously saved model.
        if self.model is None:
            raise ValueError("Model not trained.")
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized.")

        complete_path = os.path.join(path, self.language, model_name)

        if os.path.exists(complete_path):
            shutil.rmtree(complete_path)

        logger.info(f"Saving model to: {complete_path}")

        self.model.save_pretrained(complete_path)
        self.tokenizer.save_pretrained(complete_path)

        config_data = {
            "language": self.language,
            "num_labels": self.params["num_labels"],
            "model_name": model_name,
        }
        config_file = os.path.join(complete_path, "config_custom.json")
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config_data, f)

        logger.info("Model saved locally.")

        try:
            # Log on MLflow
            logger.info("Logging artifacts to MLflow...")
            mlflow.log_artifacts(local_dir=complete_path, artifact_path=f"{self.language}/{model_name}")
        except Exception as e:
            # Best effort: the local copy is already complete at this point.
            logger.error(f"Failed to log model artifacts to MLflow: {e}")