github-actions[bot] committed on
Commit
fae8ff7
·
1 Parent(s): 66e683e

Sync turing folder from GitHub

Browse files
turing/config.py CHANGED
@@ -81,6 +81,12 @@ MODEL_CONFIG = {
81
  "model_class_module": "turing.modeling.models.randomForestTfIdf",
82
  "model_class_name": "RandomForestTfIdf",
83
  },
 
 
 
 
 
 
84
  "deberta": {
85
  "model_name": "DeBERTa-v3-xsmall-raw",
86
  "exp_name": "fine-tuned-DeBERTa",
 
81
  "model_class_module": "turing.modeling.models.randomForestTfIdf",
82
  "model_class_name": "RandomForestTfIdf",
83
  },
84
+ "minilm": {
85
+ "model_name": "MiniLM",
86
+ "exp_name": "fine-tuned-MiniLm",
87
+ "model_class_module": "turing.modeling.models.miniLM",
88
+ "model_class_name": "MiniLMModel",
89
+ },
90
  "deberta": {
91
  "model_name": "DeBERTa-v3-xsmall-raw",
92
  "exp_name": "fine-tuned-DeBERTa",
turing/modeling/models/MiniLMClassifierWrapper.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import mlflow.pyfunc
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+
6
class MiniLMClassifierWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper bundling a SentenceTransformer encoder with a classifier."""

    def load_context(self, context):
        """Restore the encoder and the fitted classifier from the logged MLflow artifacts."""
        self.encoder = SentenceTransformer(context.artifacts["encoder_path"])
        self.classifier = joblib.load(context.artifacts["classifier_path"])

    def predict(self, context, model_input):
        """Embed the input texts with the encoder and return the classifier's predictions."""
        return self.classifier.predict(self.encoder.encode(model_input))
turing/modeling/models/miniLM.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import shutil
4
+
5
+ from datasets import Dataset
6
+ import joblib
7
+ from loguru import logger
8
+ import mlflow
9
+ import numpy as np
10
+ from numpy import ndarray
11
+ from peft import LoraConfig, TaskType, get_peft_model
12
+ from sentence_transformers import SentenceTransformer
13
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
14
+ from sklearn.multioutput import MultiOutputClassifier
15
+ import torch
16
+ import torch.nn as nn
17
+ from transformers import get_linear_schedule_with_warmup
18
+ from xgboost import XGBClassifier
19
+
20
+ from turing.modeling.baseModel import BaseModel
21
+ from turing.modeling.models.MiniLMClassifierWrapper import MiniLMClassifierWrapper
22
+
23
+
24
def drop_tokens(text, drop_prob=0.1):
    """Randomly remove individual tokens from *text*.

    Each whitespace-separated token is kept independently with probability
    ``1 - drop_prob``. Texts of three tokens or fewer are returned unchanged
    so that very short inputs are never destroyed.
    """
    tokens = text.split()
    if len(tokens) <= 3:
        return text

    kept = [tok for tok in tokens if random.random() > drop_prob]
    return " ".join(kept)
36
+
37
+ def drop_tokens_batch(texts, drop_prob=0.1, apply_prob=0.3):
38
+ """
39
+ Apply token dropping augmentation to a batch of texts.
40
+ """
41
+ augmented = []
42
+ for text in texts:
43
+ if random.random() < apply_prob:
44
+ augmented.append(drop_tokens(text, drop_prob))
45
+ elif random.random() < 0.15:
46
+ x = " ".join(text.split())
47
+ augmented.append(x)
48
+ else:
49
+ augmented.append(text)
50
+ return augmented
51
+
52
+
53
def finetune_miniLM(X_train, y_train, device, model_save_path="sentence-transformers/minilm.pt"):
    """
    Fine-tune a MiniLM sentence encoder with a temporary classification head.

    A LoRA adapter is attached to the underlying transformer so only a small
    number of parameters are trained. The classification head is discarded
    afterwards; only the (best) encoder is persisted to ``model_save_path``.
    Early stopping triggers after two epochs without validation-loss
    improvement.

    Args:
        X_train: Iterable of training texts.
        y_train: Multi-label binary targets, one vector per text.
        device: Torch device string ("cuda" or "cpu").
        model_save_path (str): Where the fine-tuned encoder is saved.

    Returns:
        dict: Training configuration (total_steps, warm_up_steps, batch_size,
        epochs, model_save_path) for experiment logging.
    """
    encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2').to(device)
    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        lora_alpha=16,
        bias="none",
        lora_dropout=0.1,
    )

    # Wrap only the underlying transformer module with the LoRA adapter.
    encoder[0].auto_model = get_peft_model(encoder[0].auto_model, peft_config)
    encoder[0].auto_model.print_trainable_parameters()

    y_train = np.array(y_train, dtype=np.float32)

    dataset = Dataset.from_dict({"text": X_train, "labels": y_train})
    split_set = dataset.train_test_split(test_size=0.2, seed=42)
    train_set = split_set['train']
    eval_set = split_set['test']

    # Fix: keep the epoch count in its own name instead of reusing `epoch`
    # as the loop variable, so the returned "epochs" is the configured total.
    num_epochs = 10
    batch_size = 32
    total_steps = len(train_set) // batch_size * num_epochs
    warm_up_steps = int(0.1 * total_steps)

    classifier = nn.Sequential(
        nn.Linear(encoder.get_sentence_embedding_dimension(), 128),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(128, len(y_train[0]))
    ).to(device)

    logger.info(f"Training set size: {len(train_set)}, Evaluation set size: {len(eval_set)}")

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(
        list(classifier.parameters()) + list(encoder.parameters()),
        lr=1e-4,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warm_up_steps,
        num_training_steps=total_steps
    )

    logger.info("Starting training of MiniLM model with classification head...")

    low_loss = float('inf')
    patience_counter = 0
    for epoch in range(num_epochs):
        encoder.train()
        classifier.train()
        losses = []
        for i in range(0, len(train_set), batch_size):
            batch = train_set[i:i + batch_size]
            labels = torch.tensor(batch['labels']).to(device)
            texts = drop_tokens_batch(batch['text'])

            features = encoder.tokenize(texts)
            features = {k: v.to(device) for k, v in features.items()}

            # Fix: do NOT re-wrap the embedding in torch.tensor() — that
            # detaches it from the autograd graph, so loss.backward() would
            # never propagate gradients into the encoder/LoRA weights.
            embeddings = encoder(features)['sentence_embedding']
            logits = classifier(embeddings)

            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            if i % 100 == 0:
                logger.info("Done {} out of {} batches".format(i, len(train_set)))

        # Validation pass: early stopping is driven by the average eval loss.
        encoder.eval()
        classifier.eval()
        with torch.no_grad():
            for i in range(0, len(eval_set), batch_size):
                batch = eval_set[i:i + batch_size]
                labels = torch.tensor(batch['labels']).to(device)

                # encode() returns a numpy array, so wrapping is needed here.
                embeddings = torch.tensor(encoder.encode(batch['text'])).to(device)
                logits = classifier(embeddings)
                losses.append(criterion(logits, labels).item())

        avg_loss = sum(losses) / len(losses)
        logger.info(f"Epoch {epoch+1} completed, Loss: {avg_loss:.4f}")

        if avg_loss < low_loss:
            low_loss = avg_loss
            patience_counter = 0
            encoder.save(model_save_path)
            logger.info(f"encoder saved at {model_save_path}.")
        else:
            patience_counter += 1
            if patience_counter >= 2:
                logger.info("Early stopping triggered.")
                break
    logger.info("MiniLM model trained with classification head.")
    return {
        "total_steps": total_steps,
        "warm_up_steps": warm_up_steps,
        "batch_size": batch_size,
        "epochs": num_epochs,
        "model_save_path": model_save_path
    }
181
+
182
+
183
class MiniLMModel(BaseModel):
    """
    MiniLM model implementation for efficient text embeddings.

    Texts are embedded with a MiniLM SentenceTransformer encoder (fine-tuned
    for the "java" language) and classified with a multi-output XGBoost
    classifier, one binary model per label.
    """

    def __init__(self, language, path=None):
        """
        Initialize the MiniLM model with configuration parameters.

        Args:
            language (str): Language for the model.
            path (str, optional): Path to load a pre-trained model. Defaults to None.
                If None, a new model is initialized.
        """
        # Hyper-parameters for the downstream XGBoost classifier.
        self.number_of_estimators = 300
        self.learning_rate = 0.1
        self.max_depth = 4
        self.tree_method = 'hist'
        self.objective = 'binary:logistic'
        self.eval_metric = 'logloss'
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Collected so callers can log the run configuration.
        self.params = {
            "number_of_estimators": self.number_of_estimators,
            "learning_rate": self.learning_rate,
            "max_depth": self.max_depth,
            "tree_method": self.tree_method,
            "objective": self.objective,
            "eval_metric": self.eval_metric
        }
        super().__init__(language, path)

    def setup_model(self):
        """
        Initialize the MiniLM SentenceTransformer model.

        The encoder itself is loaded lazily in train(); here only the
        multi-output XGBoost classifier is constructed.
        """
        self.encoder = None
        self.model_path = "sentence-transformers/minilm.pt"
        xgb_classifier = XGBClassifier(n_estimators=self.number_of_estimators,
                                       eval_metric=self.eval_metric,
                                       objective=self.objective,
                                       learning_rate=self.learning_rate,
                                       max_depth=self.max_depth,
                                       tree_method=self.tree_method)
        # One binary classifier per label (multi-label setup).
        self.classifier = MultiOutputClassifier(xgb_classifier)
        logger.info("MiniLM model initialized.")

    def train(self, X_train, y_train):
        """
        Train the MiniLM model with a classification head.

        For the "java" language the encoder is fine-tuned (or reloaded if a
        fine-tuned checkpoint already exists at self.model_path); then the
        XGBoost classifier is fitted on the resulting embeddings.

        Args:
            X_train: Input training data.
            y_train: True labels for training data.

        Returns:
            dict: The XGBoost hyper-parameters used for this run.
        """
        if self.encoder is None and self.language == "java":
            if os.path.exists(self.model_path):
                logger.info(f"Loading existing MiniLM model from {self.model_path} for fine-tuning...")
                self.encoder = SentenceTransformer(self.model_path).to(self.device)
            else:
                logger.info(f"Fine-tuning MiniLM encoder using {self.language} training data...")
                parameters = finetune_miniLM(X_train, y_train, device=self.device, model_save_path=self.model_path)
                self.params.update(parameters)
                self.encoder = SentenceTransformer(parameters["model_save_path"]).to(self.device)

        # NOTE(review): for non-java languages this assumes a fine-tuned
        # checkpoint already exists at self.model_path — confirm the java
        # run always happens first.
        if self.encoder is None:
            self.encoder = SentenceTransformer(self.model_path).to(self.device)

        y_train = np.array(y_train, dtype=np.float32)

        train_embeddings = self.encoder.encode(X_train)

        logger.info("Starting training of MiniLM model with Xgboost...")
        self.classifier.fit(train_embeddings, y_train)

        return {
            "n_estimators": self.number_of_estimators,
            "learning_rate": self.learning_rate,
            "max_depth": self.max_depth,
            "tree_method": self.tree_method,
            "objective": self.objective,
            "eval_metric": self.eval_metric
        }

    def evaluate(self, X_test, y_test) -> dict[str, float]:
        """
        Evaluate the MiniLM model on test data.

        Args:
            X_test: Input test data.
            y_test: True labels for test data.

        Returns:
            dict[str, float]: Accuracy, F1 (micro/macro/weighted), recall
            and precision scores.
        """
        y_test = np.array(y_test, dtype=np.float32)

        test_embeddings = self.encoder.encode(X_test)
        predictions = self.classifier.predict(test_embeddings)

        accuracy = accuracy_score(y_test, predictions)
        f1_micro = f1_score(y_test, predictions, average='micro')
        f1_macro = f1_score(y_test, predictions, average='macro')
        f1_weighted = f1_score(y_test, predictions, average='weighted')
        recall = recall_score(y_test, predictions, average='weighted')
        precision = precision_score(y_test, predictions, average='weighted')

        return {
            "accuracy": accuracy,
            "f1_micro_score": f1_micro,
            "f1_macro_score": f1_macro,
            "f1_weighted_score": f1_weighted,
            "recall": recall,
            "precision": precision
        }

    def predict(self, X) -> ndarray:
        """
        Make predictions using the trained MiniLM model.

        Args:
            X: Input data for prediction.

        Returns:
            Predictions made by the model.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if self.encoder is None or self.classifier is None:
            raise ValueError("Model is not trained. Call train() or load() before prediction.")

        encodedText = self.encoder.encode(X)
        predictions = self.classifier.predict(encodedText)
        logger.info(f"Predictions: {predictions}.")
        return predictions

    def save(self, path, model_name):
        """
        Save model and log to MLflow.

        Args:
            path (str): Path to save the model.
            model_name (str): Name to use when saving the model (without extension).

        Raises:
            ValueError: If either the encoder or the classifier is missing.
        """
        # Fix: was `and`, which let a half-initialized model (only one of the
        # two components present) pass the guard and crash below.
        if self.encoder is None or self.classifier is None:
            raise ValueError("Model is not trained. Cannot save uninitialized model.")

        complete_path = os.path.join(path, model_name)
        encoder_path = complete_path + f"_encoder_{self.language}"
        classifier_path = complete_path + f"_xgb_classifier_{self.language}.joblib"

        # Remove any stale directory at the base path before writing artifacts.
        if os.path.exists(complete_path) and os.path.isdir(complete_path):
            shutil.rmtree(complete_path)

        self.encoder.save(encoder_path)
        joblib.dump(self.classifier, classifier_path)

        try:
            # Log to MLflow
            logger.info("Logging artifacts to MLflow...")
            mlflow.pyfunc.log_model(
                artifact_path=f"{model_name}_{self.language}",
                python_model=MiniLMClassifierWrapper(),
                artifacts={
                    "encoder_path": encoder_path,
                    "classifier_path": classifier_path
                },
                code_paths=["turing/modeling/models/MiniLMClassifierWrapper.py"]
            )
        except Exception as e:
            # Best-effort: local save succeeded even if MLflow logging fails.
            logger.error(f"Failed to log model artifacts to MLflow: {e}")