github-actions[bot] committed on
Commit
5ecd2f9
·
1 Parent(s): 38593e7

Sync turing folder from GitHub

Browse files
turing/modeling/models/codeBerta.py CHANGED
@@ -32,7 +32,6 @@ warnings.filterwarnings("ignore")
32
 
33
  def compute_metrics(eval_pred):
34
  predictions, labels = eval_pred
35
-
36
  # Sigmoid function to convert logits to probabilities
37
  probs = 1 / (1 + np.exp(-predictions))
38
 
@@ -67,11 +66,11 @@ class CodeBERTaDataset(Dataset):
67
  """
68
 
69
  self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
70
-
71
  if labels is not None:
72
  if not isinstance(labels, (np.ndarray, torch.Tensor)):
73
  labels = np.array(labels)
74
-
75
  # Case A: labels are indices (integers)
76
  if num_labels is not None and (len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1)):
77
  labels_flat = labels.flatten()
@@ -149,12 +148,11 @@ class CodeBERTa(BaseModel):
149
  "early_stopping_patience": 3,
150
  "early_stopping_threshold": 0.005
151
  }
152
-
153
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
154
  self.tokenizer = None
155
-
156
- super().__init__(language, path)
157
 
 
158
 
159
  def setup_model(self):
160
  """
@@ -162,7 +160,7 @@ class CodeBERTa(BaseModel):
162
  """
163
 
164
  logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...")
165
-
166
  self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"])
167
  self.model = AutoModelForSequenceClassification.from_pretrained(
168
  self.params["model_name_hf"],
@@ -218,23 +216,21 @@ class CodeBERTa(BaseModel):
218
  if self.model is None:
219
  raise ValueError("Model is not initialized. Call setup_model() before training.")
220
 
221
- # log parameters to MLflow without model_name_hf
222
  params_to_log = {k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels"}
223
-
224
  logger.info(f"Starting training for: {self.language.upper()}")
225
-
226
  # Prepare dataset (train/val split)
227
  train_encodings = self._tokenize(X_train)
228
  full_dataset = CodeBERTaDataset(train_encodings, y_train, num_labels=self.params["num_labels"])
 
229
  train_size = int(self.params["train_size"] * len(full_dataset))
230
  val_size = len(full_dataset) - train_size
231
  train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])
232
 
233
  temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints")
234
-
235
  use_fp16 = torch.cuda.is_available()
236
- if not use_fp16:
237
- logger.info("Mixed Precision (fp16) disabled because CUDA is not available.")
238
 
239
  training_args = TrainingArguments(
240
  output_dir=temp_ckpt_dir,
@@ -314,9 +310,8 @@ class CodeBERTa(BaseModel):
314
  idx = int(label_idx)
315
  if 0 <= idx < num_labels:
316
  y_test_expanded[i, idx] = 1
317
-
318
- y_test_np = y_test_expanded
319
 
 
320
  # Generate classification report
321
  report = classification_report(y_test_np, y_pred, zero_division=0)
322
  print("\n" + "=" * 50)
@@ -330,12 +325,8 @@ class CodeBERTa(BaseModel):
330
  "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0),
331
  "f1_score": f1_score(y_test_np, y_pred, average="macro"),
332
  }
333
-
334
  mlflow.log_metrics(metrics)
335
-
336
- logger.info(
337
- f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
338
- )
339
  return metrics
340
 
341
 
@@ -350,36 +341,28 @@ class CodeBERTa(BaseModel):
350
  Returns:
351
  np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
352
  """
353
-
354
  if self.model is None:
355
  raise ValueError("Model is not trained. Call train() or load() before prediction.")
356
 
357
  # Set model to evaluation mode
358
  self.model.eval()
359
 
 
360
  encodings = self._tokenize(X)
361
- # Pass None as labels because we are in inference
362
- dataset = CodeBERTaDataset(encodings, labels=None)
363
 
364
- use_fp16 = torch.cuda.is_available()
 
365
 
366
- training_args = TrainingArguments(
367
- output_dir="./pred_temp",
368
- per_device_eval_batch_size=self.params["batch_size_eval"],
369
- fp16=use_fp16,
370
- report_to="none",
371
- no_cuda=not torch.cuda.is_available()
372
- )
373
 
374
- trainer = Trainer(model=self.model, args=training_args)
375
- output = trainer.predict(dataset)
376
 
377
- # Clean up temporary prediction directory
378
- if os.path.exists("./pred_temp"):
379
- shutil.rmtree("./pred_temp")
380
-
381
- # Convert logits to probabilities
382
- logits = output.predictions
383
  probs = 1 / (1 + np.exp(-logits))
384
 
385
  # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
@@ -387,7 +370,6 @@ class CodeBERTa(BaseModel):
387
 
388
  return preds_binary
389
 
390
-
391
  def save(self, path, model_name):
392
  """
393
  Save model locally and log to MLflow as artifact.
@@ -420,7 +402,6 @@ class CodeBERTa(BaseModel):
420
  except Exception as e:
421
  logger.error(f"Failed to log model artifacts to MLflow: {e}")
422
 
423
-
424
  def load(self, model_path):
425
  """
426
  Load model from a local path OR an MLflow URI.
@@ -447,14 +428,14 @@ class CodeBERTa(BaseModel):
447
  try:
448
  if not os.path.exists(local_model_path):
449
  raise FileNotFoundError(f"Model path not found: {local_model_path}")
450
-
451
  # Load tokenizer and model from local path
452
  self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
453
  self.model = AutoModelForSequenceClassification.from_pretrained(
454
- local_model_path
 
455
  ).to(self.device)
456
  logger.info("Model loaded from local path successfully.")
457
-
458
  except Exception as e:
459
  logger.error(f"Failed to load model from local path: {e}")
460
  raise e
 
32
 
33
  def compute_metrics(eval_pred):
34
  predictions, labels = eval_pred
 
35
  # Sigmoid function to convert logits to probabilities
36
  probs = 1 / (1 + np.exp(-predictions))
37
 
 
66
  """
67
 
68
  self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
69
+
70
  if labels is not None:
71
  if not isinstance(labels, (np.ndarray, torch.Tensor)):
72
  labels = np.array(labels)
73
+
74
  # Case A: labels are indices (integers)
75
  if num_labels is not None and (len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1)):
76
  labels_flat = labels.flatten()
 
148
  "early_stopping_patience": 3,
149
  "early_stopping_threshold": 0.005
150
  }
151
+
152
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
153
  self.tokenizer = None
 
 
154
 
155
+ super().__init__(language, path)
156
 
157
  def setup_model(self):
158
  """
 
160
  """
161
 
162
  logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...")
163
+
164
  self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"])
165
  self.model = AutoModelForSequenceClassification.from_pretrained(
166
  self.params["model_name_hf"],
 
216
  if self.model is None:
217
  raise ValueError("Model is not initialized. Call setup_model() before training.")
218
 
219
+ # Log parameters to MLflow, excluding model_name_hf and num_labels
220
  params_to_log = {k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels"}
 
221
  logger.info(f"Starting training for: {self.language.upper()}")
222
+
223
  # Prepare dataset (train/val split)
224
  train_encodings = self._tokenize(X_train)
225
  full_dataset = CodeBERTaDataset(train_encodings, y_train, num_labels=self.params["num_labels"])
226
+ full_dataset = CodeBERTaDataset(train_encodings, y_train, num_labels=self.params["num_labels"])
227
  train_size = int(self.params["train_size"] * len(full_dataset))
228
  val_size = len(full_dataset) - train_size
229
  train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])
230
 
231
  temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints")
232
+
233
  use_fp16 = torch.cuda.is_available()
 
 
234
 
235
  training_args = TrainingArguments(
236
  output_dir=temp_ckpt_dir,
 
310
  idx = int(label_idx)
311
  if 0 <= idx < num_labels:
312
  y_test_expanded[i, idx] = 1
 
 
313
 
314
+ y_test_np = y_test_expanded
315
  # Generate classification report
316
  report = classification_report(y_test_np, y_pred, zero_division=0)
317
  print("\n" + "=" * 50)
 
325
  "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0),
326
  "f1_score": f1_score(y_test_np, y_pred, average="macro"),
327
  }
 
328
  mlflow.log_metrics(metrics)
329
+ logger.info(f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}")
 
 
 
330
  return metrics
331
 
332
 
 
341
  Returns:
342
  np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
343
  """
344
+
345
  if self.model is None:
346
  raise ValueError("Model is not trained. Call train() or load() before prediction.")
347
 
348
  # Set model to evaluation mode
349
  self.model.eval()
350
 
351
+ # Tokenize inputs
352
  encodings = self._tokenize(X)
 
 
353
 
354
+ # Convert lists to tensors and move to device
355
+ inputs = {key: torch.tensor(val).to(self.device) for key, val in encodings.items()}
356
 
357
+ # Inference (no gradients, lightweight)
358
+ with torch.no_grad():
359
+ outputs = self.model(**inputs)
360
+ logits = outputs.logits
 
 
 
361
 
362
+ # Move back to CPU and convert to numpy
363
+ logits = logits.cpu().numpy()
364
 
365
+ # Sigmoid + Threshold
 
 
 
 
 
366
  probs = 1 / (1 + np.exp(-logits))
367
 
368
  # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
 
370
 
371
  return preds_binary
372
 
 
373
  def save(self, path, model_name):
374
  """
375
  Save model locally and log to MLflow as artifact.
 
402
  except Exception as e:
403
  logger.error(f"Failed to log model artifacts to MLflow: {e}")
404
 
 
405
  def load(self, model_path):
406
  """
407
  Load model from a local path OR an MLflow URI.
 
428
  try:
429
  if not os.path.exists(local_model_path):
430
  raise FileNotFoundError(f"Model path not found: {local_model_path}")
431
+
432
  # Load tokenizer and model from local path
433
  self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
434
  self.model = AutoModelForSequenceClassification.from_pretrained(
435
+ local_model_path,
436
+ low_cpu_mem_usage=False
437
  ).to(self.device)
438
  logger.info("Model loaded from local path successfully.")
 
439
  except Exception as e:
440
  logger.error(f"Failed to load model from local path: {e}")
441
  raise e
turing/modeling/models/graphCodeBert.py CHANGED
@@ -353,39 +353,31 @@ class GraphCodeBERTClassifier(BaseModel):
353
 
354
  Returns:
355
  np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
 
356
  """
357
-
358
  if self.model is None:
359
  raise ValueError("Model is not trained. Call train() or load() before prediction.")
360
 
361
  # Set model to evaluation mode
362
  self.model.eval()
363
 
 
364
  encodings = self._tokenize(X)
365
- # Pass None as labels because we are in inference
366
- dataset = GraphCodeBERTDataset(encodings, labels=None)
367
-
368
- use_fp16 = torch.cuda.is_available()
369
-
370
- training_args = TrainingArguments(
371
- output_dir="./pred_temp",
372
- per_device_eval_batch_size=self.params["batch_size_eval"],
373
- fp16=use_fp16,
374
- report_to="none",
375
- no_cuda=not torch.cuda.is_available(),
376
- )
377
 
378
- trainer = Trainer(model=self.model, args=training_args)
379
- output = trainer.predict(dataset)
 
 
380
 
381
- # Clean up temporary prediction directory
382
- if os.path.exists("./pred_temp"):
383
- shutil.rmtree("./pred_temp")
384
 
385
- # Convert logits to probabilities
386
- logits = output.predictions
387
  probs = 1 / (1 + np.exp(-logits))
388
-
389
  # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
390
  preds_binary = (probs > 0.5).astype(int)
391
 
@@ -456,9 +448,10 @@ class GraphCodeBERTClassifier(BaseModel):
456
 
457
  # Load tokenizer and model from local path
458
  self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
459
- self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path).to(
460
- self.device
461
- )
 
462
  logger.info("Model loaded from local path successfully.")
463
 
464
  except Exception as e:
 
353
 
354
  Returns:
355
  np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
356
+ Make predictions for Multi-Label classification using direct PyTorch inference.
357
  """
 
358
  if self.model is None:
359
  raise ValueError("Model is not trained. Call train() or load() before prediction.")
360
 
361
  # Set model to evaluation mode
362
  self.model.eval()
363
 
364
+ # Tokenize inputs
365
  encodings = self._tokenize(X)
366
+
367
+ # Convert lists to tensors and move to device
368
+ inputs = {key: torch.tensor(val).to(self.device) for key, val in encodings.items()}
 
 
 
 
 
 
 
 
 
369
 
370
+ # Inference (no gradients, lightweight)
371
+ with torch.no_grad():
372
+ outputs = self.model(**inputs)
373
+ logits = outputs.logits
374
 
375
+ # Move back to CPU and convert to numpy
376
+ logits = logits.cpu().numpy()
 
377
 
378
+ # Sigmoid + Threshold
 
379
  probs = 1 / (1 + np.exp(-logits))
380
+
381
  # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
382
  preds_binary = (probs > 0.5).astype(int)
383
 
 
448
 
449
  # Load tokenizer and model from local path
450
  self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
451
+ self.model = AutoModelForSequenceClassification.from_pretrained(
452
+ local_model_path,
453
+ low_cpu_mem_usage=False
454
+ ).to(self.device)
455
  logger.info("Model loaded from local path successfully.")
456
 
457
  except Exception as e:
turing/modeling/predict.py CHANGED
@@ -39,6 +39,7 @@ class ModelInference:
39
  warnings.filterwarnings("ignore")
40
  self.dataset_manager = DatasetManager()
41
  self.use_best_model_tags = use_best_model_tags
 
42
 
43
  # Initialize model registry based on configuration
44
  if use_best_model_tags:
@@ -141,20 +142,26 @@ class ModelInference:
141
  model_config = self.model_registry[language]
142
  run_id = model_config["run_id"]
143
  artifact_name = model_config["artifact"]
144
- model_id = model_config["model_id"]
145
-
146
- # Dynamically import model class
147
- config_entry = MODEL_CONFIG[model_id]
148
- module_name = config_entry["model_class_module"]
149
- class_name = config_entry["model_class_name"]
150
- module = importlib.import_module(module_name)
151
- model_class = getattr(module, class_name)
152
-
153
- # 2. Get Model Path (Local Cache or Download)
154
- model_path = self._get_cached_model_path(run_id, artifact_name, language)
155
-
156
- # Load Model
157
- model = model_class(language=language, path=model_path)
 
 
 
 
 
 
158
 
159
  # 3. Predict
160
  raw_predictions = model.predict(texts)
 
39
  warnings.filterwarnings("ignore")
40
  self.dataset_manager = DatasetManager()
41
  self.use_best_model_tags = use_best_model_tags
42
+ self.loaded_models = {}
43
 
44
  # Initialize model registry based on configuration
45
  if use_best_model_tags:
 
142
  model_config = self.model_registry[language]
143
  run_id = model_config["run_id"]
144
  artifact_name = model_config["artifact"]
145
+ if language not in self.loaded_models:
146
+ logger.info(f"Model for {language} not in memory. Loading...")
147
+
148
+ model_id = model_config["model_id"]
149
+
150
+ # Dynamically import model class
151
+ config_entry = MODEL_CONFIG[model_id]
152
+ module_name = config_entry["model_class_module"]
153
+ class_name = config_entry["model_class_name"]
154
+ module = importlib.import_module(module_name)
155
+ model_class = getattr(module, class_name)
156
+
157
+ # Get Model Path (Local Cache or Download)
158
+ model_path = self._get_cached_model_path(run_id, artifact_name, language)
159
+
160
+ # Load Model and store in cache
161
+ self.loaded_models[language] = model_class(language=language, path=model_path)
162
+ logger.success(f"Model for {language} loaded into memory.")
163
+
164
+ model = self.loaded_models[language]
165
 
166
  # 3. Predict
167
  raw_predictions = model.predict(texts)
turing/monitoring/locustfile.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ from locust import HttpUser, between, task
4
+
5
+
6
class TuringApiUser(HttpUser):
    """Load-test user that probes the API root and posts prediction requests.

    The two tasks are weighted 1 (health check) to 3 (prediction).
    """

    # Pause between consecutive tasks, produced by locust's between(1, 5).
    wait_time = between(1, 5)

    # Languages a prediction request may target.
    languages = ["python", "java", "pharo"]

    # Sample snippets sent as the "texts" field, keyed by language.
    code_snippets = {
        "python": ["def init(self): pass", "print('Hello World')", "import os"],
        "java": [
            "public static void main(String[] args)",
            "System.out.println(e);",
            "private int x = 0;",
        ],
        "pharo": ["Transcript show: 'Hello'.", "^ self size", "Object subclass: #Name"],
    }

    @task(1)
    def health_check(self):
        """Send a GET request to the root endpoint."""
        self.client.get("/")

    @task(3)
    def predict_code_classification(self):
        """POST the snippets of one randomly chosen language to /predict."""
        lang = random.choice(self.languages)

        # Snippets always match the chosen language; the fallback list is
        # only used if the language has no entry in code_snippets.
        body = {
            "texts": self.code_snippets.get(lang, ["generic code"]),
            "language": lang,
        }

        # The request name is a fixed label, independent of the language picked.
        self.client.post(
            "/predict",
            json=body,
            headers={"Content-Type": "application/json"},
            name="/predict (random lang)",
        )