Yusuf committed on
Commit
25fbc07
·
1 Parent(s): ec1eb7e

fix: visualise batch & epoch metrics separately

Browse files
trainingModel/Training.py CHANGED
@@ -15,10 +15,10 @@ def train_model(
15
  device: torch.device,
16
  n_epochs: int = 4,
17
  lr: float = 1e-3,
 
 
 
18
  save_path: str = "best_model.pt",
19
- flatten_input = False,
20
- num_classes : int = 39,
21
-
22
  ):
23
  """
24
  Trains the given model and returns:
@@ -40,7 +40,11 @@ def train_model(
40
 
41
  # Loss and optimizer
42
  criterion = nn.CrossEntropyLoss()
43
- optimizer = torch.optim.Adam(model.parameters(), lr=lr ) # might add momentum 0.9 later
 
 
 
 
44
 
45
  # Metric trackers
46
  train_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
@@ -49,20 +53,31 @@ def train_model(
49
  # Arrays to log metrics
50
  num_batches = len(train_loader)
51
 
 
 
 
 
 
 
 
 
 
 
52
  if num_batches == 0:
53
  raise RuntimeError("UH OH!!!! empty train loader")
54
 
55
  # Store training losses and accuracies for every batch
56
  # num_batches is the number of batches for every epoch
57
- training_losses = np.zeros(num_batches * n_epochs)
58
- training_accuracies = np.zeros(num_batches * n_epochs)
59
 
60
  # store validation accuracy for every epoch
61
- val_accuracies = np.zeros(n_epochs)
62
 
63
  # keep track of best validation accuracy and best model
64
  best_accuracy = 0.0
65
 
 
66
  #----------------------
67
  # training loop
68
  #----------------------
@@ -71,8 +86,12 @@ def train_model(
71
  model.train()
72
  train_accuracy_fn.reset()
73
 
 
 
 
 
74
  # iterate over all the dataloader's mini-batches
75
- for i, batch in enumerate(train_loader):
76
 
77
  # move to GPU memory
78
  inputs = batch["image"].to(device)
@@ -88,22 +107,30 @@ def train_model(
88
  outputs = model(inputs)
89
  loss = criterion(outputs, labels)
90
 
91
- # Backward pass
92
  loss.backward()
93
-
94
- # updates the parameters
95
  optimizer.step()
96
-
97
- # log the loss value
98
- training_losses[epoch * num_batches + i] = loss.item()
99
 
100
- #updates the accuracy computation with new data
101
- train_accuracy_fn.update(outputs, labels)
 
 
 
 
 
 
 
 
102
 
103
- #compute accuracy with the current data
104
- training_accuracies[epoch * num_batches + i] = train_accuracy_fn.compute().item()
 
105
 
106
- print(f'Epoch {epoch + 1} training complete')
 
 
 
 
107
 
108
  # ----------------------
109
  # validation loop
@@ -123,25 +150,30 @@ def train_model(
123
  inputs = inputs.view(inputs.size(0), -1)
124
 
125
  outputs = model(inputs)
126
-
127
  val_accuracy_fn.update(outputs, labels)
128
 
129
- current_accuracy = val_accuracy_fn.compute().item()
130
- val_accuracies[epoch] = current_accuracy
 
 
 
131
 
132
  # keep track of best validation accuracy and save best model so far
133
- if current_accuracy > best_accuracy:
134
- best_accuracy = current_accuracy
135
  torch.save(model.state_dict(), save_path)
136
- print(f'Epoch {epoch + 1} (validation accuracy: {best_accuracy})')
 
137
  print(f'Epoch {epoch + 1} validation complete')
138
 
139
  print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
140
  print(f"Best model weights saved to: {save_path}")
141
 
142
  training_metrics = {
143
- "losses": training_losses,
144
- "accuracies": training_accuracies,
 
 
145
  "val_accuracies": val_accuracies,
146
  "best_accuracy": best_accuracy,
147
  }
 
15
  device: torch.device,
16
  n_epochs: int = 4,
17
  lr: float = 1e-3,
18
+ num_classes: int = 39,
19
+ optimizer_type: str = "adam",
20
+ flatten_input: bool = False,
21
  save_path: str = "best_model.pt",
 
 
 
22
  ):
23
  """
24
  Trains the given model and returns:
 
40
 
41
  # Loss and optimizer
42
  criterion = nn.CrossEntropyLoss()
43
+
44
+ if optimizer_type.lower() == "adam":
45
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr ) # might add momentum 0.9 later
46
+ else:
47
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr )
48
 
49
  # Metric trackers
50
  train_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
 
53
  # Arrays to log metrics
54
  num_batches = len(train_loader)
55
 
56
+ # Batch-level logs
57
+ batch_losses = []
58
+ batch_accuracies = []
59
+
60
+ # Epoch-level logs
61
+ epoch_losses = np.zeros(n_epochs)
62
+ epoch_accuracies = np.zeros(n_epochs)
63
+ val_accuracies = np.zeros(n_epochs)
64
+
65
+
66
  if num_batches == 0:
67
  raise RuntimeError("UH OH!!!! empty train loader")
68
 
69
  # Store training losses and accuracies for every batch
70
  # num_batches is the number of batches for every epoch
71
+ #training_losses = np.zeros(num_batches * n_epochs)
72
+ #training_accuracies = np.zeros(num_batches * n_epochs)
73
 
74
  # store validation accuracy for every epoch
75
+
76
 
77
  # keep track of best validation accuracy and best model
78
  best_accuracy = 0.0
79
 
80
+
81
  #----------------------
82
  # training loop
83
  #----------------------
 
86
  model.train()
87
  train_accuracy_fn.reset()
88
 
89
+ running_loss = 0.0
90
+ running_correct = 0
91
+ running_total = 0
92
+
93
  # iterate over all the dataloader's mini-batches
94
+ for batch in train_loader:
95
 
96
  # move to GPU memory
97
  inputs = batch["image"].to(device)
 
107
  outputs = model(inputs)
108
  loss = criterion(outputs, labels)
109
 
110
+ # Backward pass & update params
111
  loss.backward()
 
 
112
  optimizer.step()
 
 
 
113
 
114
+ # Log batch-level metrics
115
+ batch_losses.append(loss.item())
116
+ batch_acc = (outputs.argmax(dim=1) == labels).float().mean().item()
117
+ batch_accuracies.append(batch_acc)
118
+
119
+ # Sum epoch stats
120
+ running_loss += loss.item() * inputs.size(0)
121
+ running_correct += (outputs.argmax(dim=1) == labels).sum().item()
122
+ running_total += labels.size(0)
123
+
124
 
125
+ # Epoch-level metrics (per-sample average over the epoch, i.e. batches weighted by their size)
126
+ epoch_loss_avg = running_loss / running_total
127
+ epoch_acc_avg = running_correct / running_total
128
 
129
+ epoch_losses[epoch] = epoch_loss_avg
130
+ epoch_accuracies[epoch] = epoch_acc_avg
131
+
132
+ print(f"\n--- Epoch {epoch + 1}: ---")
133
+ print(f'Train loss={epoch_loss_avg:.4f}\nTrain accuracy={epoch_acc_avg:.4f}')
134
 
135
  # ----------------------
136
  # validation loop
 
150
  inputs = inputs.view(inputs.size(0), -1)
151
 
152
  outputs = model(inputs)
 
153
  val_accuracy_fn.update(outputs, labels)
154
 
155
+
156
+ current_val_accuracy = val_accuracy_fn.compute().item()
157
+ val_accuracies[epoch] = current_val_accuracy
158
+
159
+ print(f"Epoch {epoch+1}: val acc={current_val_accuracy:.4f}")
160
 
161
  # keep track of best validation accuracy and save best model so far
162
+ if current_val_accuracy > best_accuracy:
163
+ best_accuracy = current_val_accuracy
164
  torch.save(model.state_dict(), save_path)
165
+
166
+
167
  print(f'Epoch {epoch + 1} validation complete')
168
 
169
  print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
170
  print(f"Best model weights saved to: {save_path}")
171
 
172
  training_metrics = {
173
+ "batch_losses": np.array(batch_losses),
174
+ "batch_accuracies": np.array(batch_accuracies),
175
+ "epoch_losses": epoch_losses,
176
+ "epoch_accuracies": epoch_accuracies,
177
  "val_accuracies": val_accuracies,
178
  "best_accuracy": best_accuracy,
179
  }
trainingModel/run_training.py CHANGED
@@ -48,8 +48,6 @@ except Exception as e:
48
 
49
  full_dataset = ds['train']
50
 
51
-
52
-
53
  # Apply subset indices to full dataset - this gives you the same subset as data prep
54
  subset_dataset = full_dataset.select(subset_indices)
55
 
@@ -95,15 +93,24 @@ training_task = Task.init(
95
  reuse_last_task_id=False,
96
  )
97
 
 
98
  training_logger = training_task.get_logger()
99
- training_task.connect({"data_prep_task_used": DYNAMIC_TASK_ID})
 
 
 
 
 
 
 
100
 
101
  # Training parameters - Modify these to experiment
102
  training_config = {
103
  "num_classes": 39,
104
- "n_epochs": 1,
105
  "learning_rate": 1e-3,
106
  "batch_size": batch_size,
 
107
  "save_path": "best_model.pt",
108
  }
109
  training_task.connect(training_config)
@@ -124,21 +131,30 @@ training_metrics = train_model(
124
  device=device,
125
  n_epochs=training_config["n_epochs"],
126
  lr=training_config["learning_rate"],
 
 
127
  save_path=training_config["save_path"],
128
  )
129
 
130
 
131
  # ----------- Log metrics to ClearML -----------
132
  # Per-batch training losses and accuracies
133
- for i, loss in enumerate(training_metrics["losses"]):
134
- training_logger.report_scalar("train", "loss_per_batch", value=loss, iteration=i)
 
 
 
 
135
 
136
- for i, acc in enumerate(training_metrics["accuracies"]):
137
- training_logger.report_scalar("train", "accuracy_per_batch", value=acc, iteration=i)
 
 
 
138
 
139
- # Per-epoch validation accuracy
140
  for epoch, acc in enumerate(training_metrics["val_accuracies"]):
141
- training_logger.report_scalar("validation", "accuracy_per_epoch", value=acc, iteration=epoch)
142
 
143
  training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
144
 
 
48
 
49
  full_dataset = ds['train']
50
 
 
 
51
  # Apply subset indices to full dataset - this gives you the same subset as data prep
52
  subset_dataset = full_dataset.select(subset_indices)
53
 
 
93
  reuse_last_task_id=False,
94
  )
95
 
96
+ # Detail the data prep task used
97
  training_logger = training_task.get_logger()
98
+ data_prep_metadata = {
99
+ "data_prep_task_id": DYNAMIC_TASK_ID,
100
+ "dataset_id": dataset_id,
101
+ "dataset_link": dataset_link,
102
+ "augmentation_used": aug_config,
103
+ "seed_used": seed,
104
+ }
105
+ training_task.connect(data_prep_metadata, name="data_prep_metadata")
106
 
107
  # Training parameters - Modify these to experiment
108
  training_config = {
109
  "num_classes": 39,
110
+ "n_epochs": 3,
111
  "learning_rate": 1e-3,
112
  "batch_size": batch_size,
113
+ "optimizer": "adam",
114
  "save_path": "best_model.pt",
115
  }
116
  training_task.connect(training_config)
 
131
  device=device,
132
  n_epochs=training_config["n_epochs"],
133
  lr=training_config["learning_rate"],
134
+ num_classes=training_config["num_classes"],
135
+ optimizer_type=training_config["optimizer"],
136
  save_path=training_config["save_path"],
137
  )
138
 
139
 
140
  # ----------- Log metrics to ClearML -----------
141
  # Per-batch training losses and accuracies
142
+ for i, loss in enumerate(training_metrics["batch_losses"]):
143
+ training_logger.report_scalar("train_batch", "loss", value=loss, iteration=i)
144
+
145
+ for i, acc in enumerate(training_metrics["batch_accuracies"]):
146
+ training_logger.report_scalar("train_batch", "accuracy", value=acc, iteration=i)
147
+
148
 
149
+ # Per-epoch training losses and accuracies
150
+ epoch_metrics = zip(training_metrics["epoch_losses"], training_metrics["epoch_accuracies"])
151
+ for epoch, (loss, acc) in enumerate(epoch_metrics):
152
+ training_logger.report_scalar("train_epoch", "loss", loss, iteration=epoch)
153
+ training_logger.report_scalar("train_epoch", "accuracy", acc, iteration=epoch)
154
 
155
+ # Per-epoch validation accuracies
156
  for epoch, acc in enumerate(training_metrics["val_accuracies"]):
157
+ training_logger.report_scalar("validation_epoch", "accuracy", value=acc, iteration=epoch)
158
 
159
  training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
160