Yusuf committed on
Commit
c638d1e
·
1 Parent(s): 7b10a4d

feat: clearml training metrics

Browse files
trainingModel/Training.py CHANGED
@@ -139,5 +139,12 @@ def train_model(
139
  print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
140
  print(f"Best model weights saved to: {save_path}")
141
 
142
- return training_losses, training_accuracies, val_accuracies, best_accuracy
 
 
 
 
 
 
 
143
 
 
139
  print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
140
  print(f"Best model weights saved to: {save_path}")
141
 
142
+ training_metrics = {
143
+ "losses": training_losses,
144
+ "accuracies": training_accuracies,
145
+ "val_accuracies": val_accuracies,
146
+ "best_accuracy": best_accuracy,
147
+ }
148
+
149
+ return training_metrics
150
 
trainingModel/run_training.py CHANGED
@@ -10,7 +10,8 @@ from models.modelOne import modelOne
10
  from trainingModel.Training import train_model
11
 
12
 
13
- # Latest Data Prep Task
 
14
  all_tasks = Task.get_tasks(project_name="Small Group Project")
15
  if not all_tasks:
16
  raise RuntimeError("No tasks found in project 'Small Group Project'")
@@ -19,6 +20,7 @@ dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
19
  if not dp_tasks:
20
  raise RuntimeError("No 'Data Preparation' tasks found in this project!")
21
 
 
22
  latest_task = max(dp_tasks, key=lambda t: t.id)
23
  DYNAMIC_TASK_ID = latest_task.id
24
  DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
@@ -32,8 +34,7 @@ dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
32
  subset_clearml = Dataset.get(dataset_id=dataset_id)
33
  local_folder = subset_clearml.get_local_copy()
34
 
35
- subset_indices_path = os.path.join(local_folder, "subset_indices.npy")
36
- subset_indices = np.load(subset_indices_path)
37
 
38
  # Load Dataset Parameters
39
  data_params = DATA_PREP.get_parameters()
@@ -84,24 +85,65 @@ print("\n--- Handoff Test Successful ---")
84
  print(f"Train loader batches: {len(full_loaders['train'])}")
85
  print(f"Validation loader batches: {len(full_loaders['val'])}")
86
  print(f"Test loader batches: {len(full_loaders['test'])}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  # -------- Build the ML model --------
90
- model = modelOne(noOfClasses=39)
91
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
92
 
93
 
94
  # ------- Train the model (on subset for now) -------
95
 
96
- #When calling this function, the model should be trained on the given dataset
97
-
98
  print("\n--- Starting Model Training on Subset ---")
99
- train_model(
100
  model=model,
101
  train_loader=subset_loaders['train'],
102
  val_loader=subset_loaders['val'],
103
  device=device,
104
- n_epochs=10,
105
- lr=1e-3,
106
- save_path="best_model.pt",
107
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from trainingModel.Training import train_model
11
 
12
 
13
+ # -------------- Load Data --------------
14
+
15
  all_tasks = Task.get_tasks(project_name="Small Group Project")
16
  if not all_tasks:
17
  raise RuntimeError("No tasks found in project 'Small Group Project'")
 
20
  if not dp_tasks:
21
  raise RuntimeError("No 'Data Preparation' tasks found in this project!")
22
 
23
+ # Latest Data Prep Task
24
  latest_task = max(dp_tasks, key=lambda t: t.id)
25
  DYNAMIC_TASK_ID = latest_task.id
26
  DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
 
34
  subset_clearml = Dataset.get(dataset_id=dataset_id)
35
  local_folder = subset_clearml.get_local_copy()
36
 
37
+ subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
 
38
 
39
  # Load Dataset Parameters
40
  data_params = DATA_PREP.get_parameters()
 
85
  print(f"Train loader batches: {len(full_loaders['train'])}")
86
  print(f"Validation loader batches: {len(full_loaders['val'])}")
87
  print(f"Test loader batches: {len(full_loaders['test'])}")
88
+ # -------------- DATA PREP ENDS --------------
89
+
90
+
91
+ # -------- ClearML Training Task Setup --------
92
+ training_task = Task.init(
93
+ project_name="Small Group Project",
94
+ task_name="Model Training",
95
+ reuse_last_task_id=False,
96
+ )
97
+
98
+ training_logger = training_task.get_logger()
99
+ training_task.connect({"data_prep_task_used": DYNAMIC_TASK_ID})
100
+
101
+ # Training parameters - Modify these to experiment
102
+ training_config = {
103
+ "num_classes": 39,
104
+ "n_epochs": 1,
105
+ "learning_rate": 1e-3,
106
+ "batch_size": batch_size,
107
+ "save_path": "best_model.pt",
108
+ }
109
+ training_task.connect(training_config)
110
 
111
 
112
  # -------- Build the ML model --------
113
+ model = modelOne(noOfClasses=training_config["num_classes"])
114
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
115
 
116
 
117
  # ------- Train the model (on subset for now) -------
118
 
 
 
119
  print("\n--- Starting Model Training on Subset ---")
120
+ training_metrics = train_model(
121
  model=model,
122
  train_loader=subset_loaders['train'],
123
  val_loader=subset_loaders['val'],
124
  device=device,
125
+ n_epochs=training_config["n_epochs"],
126
+ lr=training_config["learning_rate"],
127
+ save_path=training_config["save_path"],
128
+ )
129
+
130
+
131
+ # ----------- Log metrics to ClearML -----------
132
+ # Per-batch training losses and accuracies
133
+ for i, loss in enumerate(training_metrics["losses"]):
134
+ training_logger.report_scalar("train", "loss_per_batch", value=loss, iteration=i)
135
+
136
+ for i, acc in enumerate(training_metrics["accuracies"]):
137
+ training_logger.report_scalar("train", "accuracy_per_batch", value=acc, iteration=i)
138
+
139
+ # Per-epoch validation accuracy
140
+ for epoch, acc in enumerate(training_metrics["val_accuracies"]):
141
+ training_logger.report_scalar("validation", "accuracy_per_epoch", value=acc, iteration=epoch)
142
+
143
+ training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
144
+
145
+ # Upload best model as artifact
146
+ training_task.upload_artifact("best_model", training_config["save_path"])
147
+
148
+ print("\nTraining complete.")
149
+ training_task.close()