k23064919 committed
Commit fcf6bb8 · 2 Parent(s): 6771828 1ea541c

Merge branch 'develop' of https://github.kcl.ac.uk/K23064919/smallGroupProject into develop

.gitignore CHANGED
@@ -1,10 +1,8 @@
-<<<<<<< HEAD
 .vscode/
 .venv/
 .vscode/
 .models/
 __pycache__/
-=======
 
 # Python environment
 venv/
@@ -18,4 +16,4 @@ __pycache__/
 
 # Generated files from data_preparation.py
 class_distribution.png
->>>>>>> 04cb88662062ef6b880c627546d067fa0cedfa8b
+
best_model.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23a4c08eaad4b40290eca84e6a8fa3e1d69bdf4312d5db6db5de96d1d8753024
+size 130261986
dataPrep/data_preparation.py CHANGED
@@ -45,8 +45,9 @@ if torch.cuda.is_available():
 
 
 # ----- ClearML Setup -----
+project_name = "Small Group Project"
 task = Task.init(
-    project_name='Small Group Project',
+    project_name=f'{project_name}/Data Preparation',
     task_name='Data Preparation',
     task_type=Task.TaskTypes.data_processing
 )
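
Note: ClearML treats "/" in a project name as a sub-project separator, so this change files the task under a "Data Preparation" sub-project of "Small Group Project". A minimal hedged sketch of the pattern (names taken from this diff):

    # The task now shows up under "Small Group Project" > "Data Preparation"
    # in the ClearML UI, because "/" nests sub-projects.
    from clearml import Task

    project_name = "Small Group Project"
    task = Task.init(
        project_name=f"{project_name}/Data Preparation",
        task_name="Data Preparation",
        task_type=Task.TaskTypes.data_processing,
    )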
dataPrep/helpers/clearml_data.py CHANGED
@@ -11,12 +11,12 @@ Takes latest Data Prep ClearML task from project and reconstruct:
 - data loaders for both full and subset datasets
 - Aug settings used
 '''
-def extract_latest_data_task(project_name: str = "Small Group Project"):
+def extract_latest_data_task(project_name: str = "Small Group Project", num_workers: int = 8):
 
     # --------- Get latest Data Preparation task from ClearML ---------
 
     all_tasks = Task.get_tasks(
-        project_name=project_name,
+        project_name=f'{project_name}/Data Preparation',
         allow_archived=False,
         task_filter={'order_by': ["-last_update"]},
     )
@@ -76,7 +76,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
     subset_dataset = full_dataset.select(subset_indices)
 
     # Get data loaders for both full and subset datasets
-    subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset)
+    subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset, num_workers=num_workers)
     batch_size = int(data_params['General/dataloaders/batch_size'])
     seed = int(data_params['General/seed'])
 
@@ -99,7 +99,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
 Takes a given dataset, subset, data params to create DataLoaders
 Loaders split data into train, val, test
 '''
-def get_data_loaders(data_params, subset_dataset, full_dataset):
+def get_data_loaders(data_params, subset_dataset, full_dataset, num_workers):
 
     # Extract data parameters- these will be used in the DataLoaders
     seed = int(data_params['General/seed'])
@@ -115,7 +115,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
 
     # Create DataLoaders using the parameters from data prep
     subset_loaders = make_dataset_loaders(
-        subset_dataset, seed, batch_size, test_size, aug_config
+        subset_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
     )
 
     print("\n--- Handoff Test Successful ---")
@@ -125,7 +125,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
 
 
     full_loaders = make_dataset_loaders(
-        full_dataset, seed, batch_size, test_size, aug_config
+        full_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
     )
 
     print("\n--- Handoff Test Successful ---")
dataPrep/helpers/transforms_loaders.py CHANGED
@@ -47,24 +47,25 @@ def make_augment_pipeline(aug_config):
     return augmentation
 
 
+def apply_augmentation(batch, augmentation):
+    batch['image'] = [augmentation(x) for x in batch['image']]
+    return batch
+
+def apply_normalisation(batch, normalisation):
+    batch['image'] = [normalisation(x) for x in batch['image']]
+    return batch
+
+
 """
 Creates and returns DataLoaders (train, val, test) for a given dataset.
 Performs a 70/15/15 split
 """
-def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
+def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config, workers=8):
 
     # Define transformation pipelines for the dataset
     normalisation = make_norm_pipeline()
     augmentation = make_augment_pipeline(aug_config)
 
-    def apply_augmentation(batch):
-        batch['image'] = [augmentation(x) for x in batch['image']]
-        return batch
-
-    def apply_normalisation(batch):
-        batch['image'] = [normalisation(x) for x in batch['image']]
-        return batch
-
     # 70/30 split creates train set
     split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
     train_split = split_1['train']
@@ -76,14 +77,34 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
     val_split, test_split = split_2['train'], split_2['test']
 
     # Put each split through pipelines
-    train_split.set_transform(apply_augmentation)
-    val_split.set_transform(apply_normalisation)
-    test_split.set_transform(apply_normalisation)
+    train_split.set_transform(lambda batch: apply_augmentation(batch, augmentation))
+    val_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
+    test_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
 
     # Create dataloader for each
-    train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
-    val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
-    test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
+    train_loader = DataLoader(
+        train_split,
+        batch_size=batch_size,
+        shuffle=True,
+        pin_memory=True,
+        num_workers=workers
+    )
+    val_loader = DataLoader(
+        val_split,
+        batch_size=batch_size,
+        shuffle=False,
+        pin_memory=True,
+        num_workers=workers
+    )
+    test_loader = DataLoader(
+        test_split,
+        batch_size=batch_size,
+        shuffle=False,
+        pin_memory=True,
+        num_workers=workers
+    )
+
+    print(f"\nWorkers used in DataLoaders: {workers}\n")
 
     dataset_loaders = {
         "train": train_loader,
models/modelTwo.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class BetterCNN(nn.Module):
+    def __init__(self, noOfClasses=39):
+        super(BetterCNN, self).__init__()
+
+        # 32 Channels
+        # We use padding=1 to keep spatial size same before pooling
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
+        self.bn1 = nn.BatchNorm2d(32)
+
+        # 64 Channels
+        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
+        self.bn2 = nn.BatchNorm2d(64)
+
+        # 128 Channels
+        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
+        self.bn3 = nn.BatchNorm2d(128)
+
+        # 256 Channels
+        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
+        self.bn4 = nn.BatchNorm2d(256)
+
+        # Pooling layer
+        self.pool = nn.MaxPool2d(2, 2)
+
+        # Adaptive Pooling
+        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))
+
+        # Classification Head
+        self.fc1 = nn.Linear(256 * 4 * 4, 1024)
+        self.dropout = nn.Dropout(0.5)  # Dropout after Linear layer
+
+        self.fc2 = nn.Linear(1024, 512)
+        self.fc3 = nn.Linear(512, noOfClasses)
+
+    def forward(self, x):
+        # Block 1
+        x = self.conv1(x)
+        x = self.bn1(x)  # BatchNorm
+        x = F.relu(x)
+        x = self.pool(x)
+
+        # Block 2
+        x = self.pool(F.relu(self.bn2(self.conv2(x))))
+
+        # Block 3
+        x = self.pool(F.relu(self.bn3(self.conv3(x))))
+
+        # Block 4
+        x = self.pool(F.relu(self.bn4(self.conv4(x))))
+
+        # Adapt & Flatten
+        x = self.adaptive_pool(x)
+        x = torch.flatten(x, 1)  # Flattens to (Batch, 4096)
+
+        # Dense Layers
+        x = F.relu(self.fc1(x))
+        x = self.dropout(x)  # Regularization
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)  # No activation needed here (handled by CrossEntropyLoss)
+
+        return x
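
A quick shape check for the new model, as a hedged sketch (the 256x256 input size is taken from the evaluation.py docstring below; the batch size is arbitrary):

    # Smoke test: BetterCNN should map [B, 3, 256, 256] -> [B, 39] logits.
    import torch
    from models.modelTwo import BetterCNN

    model = BetterCNN(noOfClasses=39)
    x = torch.randn(2, 3, 256, 256)
    print(model(x).shape)   # expected: torch.Size([2, 39])

The adaptive pooling to (4, 4) is what keeps the head input fixed: whatever spatial size survives the four 2x2 max-pools is averaged down to 4x4, so fc1 always sees 256 * 4 * 4 = 4096 features.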
subset_indices.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:972615a5b506b5ee2490f61866c26a4a2f9e2498c0baedb195a2a0d10a62e76f
+size 111016
testingModel/helpers/evaluation.py ADDED
@@ -0,0 +1,43 @@
+import torch
+from torch.nn import CrossEntropyLoss
+
+
+"""
+Evaluates a trained model on a dataloader that returns batches like:
+    batch["image"] -> Tensor [B, 3, 256, 256]
+    batch["label"] -> Tensor [B]
+
+Returns dict:
+    { "accuracy": float, "loss": float }
+"""
+def make_predictions(model, dataloader, device):
+
+    model.eval()
+    criterion = CrossEntropyLoss()
+
+    total_loss = 0
+    total_correct = 0
+    total_samples = 0
+
+    with torch.no_grad():
+        for batch in dataloader:
+
+            # Move tensors to device
+            images = batch["image"].to(device)
+            labels = batch["label"].to(device).long()
+
+            # Forward pass
+            outputs = model(images)
+            loss = criterion(outputs, labels)
+
+            total_loss += loss.item() * images.size(0)
+            total_correct += (outputs.argmax(dim=1) == labels).sum().item()
+            total_samples += labels.size(0)
+
+    accuracy = total_correct / total_samples
+    avg_loss = total_loss / total_samples
+
+    return {
+        "accuracy": accuracy,
+        "loss": avg_loss,
+    }
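
A hedged standalone sketch of calling the evaluator outside the ClearML flow (the toy loader below relies on PyTorch's default collation of dict samples; a real run uses the loaders from clearml_data.py):

    # Sketch only: 8 random samples instead of the real test split.
    import torch
    from torch.utils.data import DataLoader
    from models.modelTwo import BetterCNN
    from testingModel.helpers.evaluation import make_predictions

    samples = [{"image": torch.randn(3, 256, 256), "label": torch.tensor(0)}
               for _ in range(8)]
    loader = DataLoader(samples, batch_size=4)

    device = torch.device("cpu")
    results = make_predictions(BetterCNN(noOfClasses=39).to(device), loader, device)
    print(results)   # {"accuracy": ..., "loss": ...}

Note the loss is accumulated as loss.item() * images.size(0) and divided by the sample count, so the average stays correct even when the last batch is smaller.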
testingModel/run_testing.py ADDED
@@ -0,0 +1,76 @@
+from clearml import Task
+from dataPrep.helpers.clearml_data import extract_latest_data_task
+
+import torch
+from models.modelOne import modelOne
+from testingModel.helpers.evaluation import make_predictions
+
+
+# -------------- Load Data --------------
+project_name = "Small Group Project"
+subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name)
+
+
+# -------- ClearML Testing Task Setup --------
+testing_task = Task.init(
+    project_name=f"{project_name}/Model Testing",
+    task_name="Model Testing",
+    task_type=Task.TaskTypes.testing,
+    reuse_last_task_id=False,
+)
+
+# Reference the data prep task used
+testing_logger = testing_task.get_logger()
+testing_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
+
+CLEARML_TRAINING_ID = "5bac154a885b4acbaa07d8588027bb27"
+
+# Testing parameters - Modify these when experimenting
+testing_config = {
+    "model_train_id": CLEARML_TRAINING_ID,
+    "num_classes": 39,
+    "model_path": "best_model.pt",
+}
+testing_task.connect(testing_config)
+
+# Load the model weights from ClearML training task
+training_task = Task.get_task(task_id=testing_config["model_train_id"])
+model_artifact = training_task.artifacts.get("best_model")
+model_path = model_artifact.get_local_copy()
+
+# Reference training metadata
+training_hyperparams = training_task.get_parameters_as_dict()
+testing_task.connect(training_hyperparams['General'], name="training_metadata_READONLY")
+
+
+# -------- Rebuild the ML model --------
+model = modelOne()
+state_dict = torch.load(model_path, map_location="cpu")  # Load to CPU first
+model.load_state_dict(state_dict)
+model.eval()  # set dropout & batch norm layers to eval mode
+
+# Move model to GPU if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+
+# -------------------- Test model on test set --------------------
+testing_logger.report_text("Starting evaluation on TEST SUBSET...\n")
+test_subset = subset_loaders['test']
+
+subset_results = make_predictions(model, test_subset, device)
+
+
+# Accuracy & Loss logging
+testing_logger.report_single_value(name="Test Subset Accuracy", value=subset_results["accuracy"])
+testing_logger.report_single_value(name="Test Subset Loss", value=subset_results["loss"])
+
+
+# --------- Complete -----------------
+print("\n------ Testing Complete ------")
+testing_logger.report_text(
+    f"TEST SUBSET RESULTS:\n"
+    f"Loss: {subset_results['loss']:.4f}\n"
+    f"Accuracy: {subset_results['accuracy']:.4f}\n"
+)
+testing_task.close()
trainingModel/Training.py DELETED
@@ -1,182 +0,0 @@
-import torch
-import torch.nn as nn
-import numpy as np
-from torcheval.metrics import MulticlassAccuracy
-from torch.utils.data import DataLoader
-
-
-# fix errors in runtime
-
-
-def train_model(
-    model: nn.Module,
-    train_loader: DataLoader,
-    val_loader: DataLoader,
-    device: torch.device,
-    n_epochs: int = 4,
-    lr: float = 1e-3,
-    num_classes: int = 39,
-    optimizer_type: str = "adam",
-    flatten_input: bool = False,
-    save_path: str = "best_model.pt",
-):
-    """
-    Trains the given model and returns:
-    - training_losses: numpy array of loss per batch
-    - training_accuracies: numpy array of running accuracy per batch
-    - val_accuracies: numpy array of accuracy per epoch
-    - best_accuracy: highest validation accuracy achieved
-
-    Expected batch format:
-        batch["image"] → Tensor [B, C, H, W]
-        batch["label"] → Tensor [B] with class IDs (int64)
-    Model output:
-        outputs → Tensor [B, num_classes] (logits)
-    """
-
-
-    # Move model to device
-    model.to(device)
-
-    # Loss and optimizer
-    criterion = nn.CrossEntropyLoss()
-
-    if optimizer_type.lower() == "adam":
-        optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # might add momentum 0.9 later
-    else:
-        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
-
-    # Metric trackers
-    train_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
-    val_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
-
-    # Arrays to log metrics
-    num_batches = len(train_loader)
-
-    # Batch-level logs
-    batch_losses = []
-    batch_accuracies = []
-
-    # Epoch-level logs
-    epoch_losses = np.zeros(n_epochs)
-    epoch_accuracies = np.zeros(n_epochs)
-    val_accuracies = np.zeros(n_epochs)
-
-
-    if num_batches == 0:
-        raise RuntimeError("UH OH!!!! empty train loader")
-
-    # Store training losses and accuracies for every batch
-    # num_batches is the number of batches for every epoch
-    #training_losses = np.zeros(num_batches * n_epochs)
-    #training_accuracies = np.zeros(num_batches * n_epochs)
-
-    # store validation accuracy for every epoch
-
-
-    # keep track of best validation accuracy and best model
-    best_accuracy = 0.0
-
-
-    #----------------------
-    # training loop
-    #----------------------
-
-    for epoch in range(n_epochs):
-        model.train()
-        train_accuracy_fn.reset()
-
-        running_loss = 0.0
-        running_correct = 0
-        running_total = 0
-
-        # iterate over all the dataloader's mini-batches
-        for batch in train_loader:
-
-            # move to GPU memory
-            inputs = batch["image"].to(device)
-            labels = batch["label"].to(device).long()
-
-            # flatten if not cnn REVISE LATER
-            if flatten_input:
-                inputs = inputs.view(inputs.size(0), -1)
-
-            optimizer.zero_grad()
-
-            # Forward pass
-            outputs = model(inputs)
-            loss = criterion(outputs, labels)
-
-            # Backward pass & update params
-            loss.backward()
-            optimizer.step()
-
-            # Log batch-level metrics
-            batch_losses.append(loss.item())
-            batch_acc = (outputs.argmax(dim=1) == labels).float().mean().item()
-            batch_accuracies.append(batch_acc)
-
-            # Sum epoch stats
-            running_loss += loss.item() * inputs.size(0)
-            running_correct += (outputs.argmax(dim=1) == labels).sum().item()
-            running_total += labels.size(0)
-
-
-        # Epoch-level metrics (average over all batches)
-        epoch_loss_avg = running_loss / running_total
-        epoch_acc_avg = running_correct / running_total
-
-        epoch_losses[epoch] = epoch_loss_avg
-        epoch_accuracies[epoch] = epoch_acc_avg
-
-        print(f"\n--- Epoch {epoch + 1}: ---")
-        print(f'Train loss={epoch_loss_avg:.4f}\nTrain accuracy={epoch_acc_avg:.4f}\n')
-
-        # ----------------------
-        # validation loop
-        # ----------------------
-
-        model.eval()
-        val_accuracy_fn.reset()
-
-
-        with torch.no_grad():
-            for batch in val_loader:
-                inputs = batch["image"].to(device)
-                labels = batch["label"].to(device).long()
-
-                # flatten if not cnn REVISE LATER
-                if flatten_input:
-                    inputs = inputs.view(inputs.size(0), -1)
-
-                outputs = model(inputs)
-                val_accuracy_fn.update(outputs, labels)
-
-
-        current_val_accuracy = val_accuracy_fn.compute().item()
-        val_accuracies[epoch] = current_val_accuracy
-
-        print(f"\nEpoch {epoch+1}: val acc={current_val_accuracy:.4f}")
-
-        # keep track of best validation accuracy and save best model so far
-        if current_val_accuracy > best_accuracy:
-            best_accuracy = current_val_accuracy
-            torch.save(model.state_dict(), save_path)
-
-
-        print(f'Epoch {epoch + 1} validation complete\n')
-
-    print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
-    print(f"Best model weights saved to: {save_path}")
-
-    training_metrics = {
-        "batch_losses": np.array(batch_losses),
-        "batch_accuracies": np.array(batch_accuracies),
-        "epoch_losses": epoch_losses,
-        "epoch_accuracies": epoch_accuracies,
-        "val_accuracies": val_accuracies,
-        "best_accuracy": best_accuracy,
-    }
-
-    return training_metrics
-
trainingModel/helpers/Training.py ADDED
@@ -0,0 +1,199 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from torcheval.metrics import MulticlassAccuracy
+from torch.utils.data import DataLoader
+
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Using device:", DEVICE)
+
+def train_model(
+    model: nn.Module,
+    train_loader: DataLoader,
+    val_loader: DataLoader,
+    n_epochs: int = 4,
+    lr: float = 1e-3,
+    save_path: str = "best_model.pt",
+    num_classes: int = 39,
+    early_stop: int = 3,
+):
+    """
+    Trains the given model and returns:
+    - training_losses: numpy array of loss per epoch
+    - training_accuracies: numpy array of running accuracy per epoch
+    - val_accuracies: numpy array of accuracy per epoch
+    - best_accuracy: highest validation accuracy achieved
+
+    Expected batch format:
+        batch["image"] → Tensor [B, C, H, W]
+        batch["label"] → Tensor [B] with class IDs (int64)
+    Model output:
+        outputs → Tensor [B, num_classes] (logits)
+    """
+
+    # Move model to device
+    model.to(DEVICE)
+
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # might add momentum 0.9 later
+
+    # Metric trackers
+    train_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
+    val_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
+
+    # Arrays to log metrics
+    num_batches = len(train_loader)
+
+    if num_batches == 0:
+        raise RuntimeError("UH OH!!!! empty train loader")
+
+    # Store training losses and accuracies for every epoch
+    training_losses = np.zeros(n_epochs)
+    training_accuracies = np.zeros(n_epochs)
+
+    # Store validation accuracy for every epoch
+    val_accuracies = np.zeros(n_epochs)
+
+    # Keep track of best validation accuracy and best model
+    best_accuracy = 0.0
+
+    # Keep track of epochs without accuracy improvement (for early stopping)
+    improv_counter = 0
+
+    # ----------------------
+    # training loop
+    # ----------------------
+
+    for epoch in range(n_epochs):
+        model.train()
+        train_accuracy_fn.reset()
+
+        training_loss = 0.0
+
+        # Iterate over all the dataloader's mini-batches
+        for i, batch in enumerate(train_loader):
+
+            # Move to GPU memory
+            inputs = batch["image"].to(DEVICE)
+            labels = batch["label"].to(DEVICE).long()
+
+            optimizer.zero_grad()
+
+            # Forward pass
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+
+            # Backward pass
+            loss.backward()
+
+            # Update the parameters
+            optimizer.step()
+
+            # Log the loss value for the epoch
+            training_loss += loss.item()
+
+            # Update the accuracy computation with new data
+            train_accuracy_fn.update(outputs, labels)
+
+        # Compute epoch-level training metrics
+        training_losses[epoch] = training_loss / num_batches
+        training_accuracies[epoch] = train_accuracy_fn.compute().item()
+
+        print(f'Epoch {epoch + 1} training complete. Training Accuracy: {training_accuracies[epoch]:.4f}')
+
+        # ----------------------
+        # validation loop
+        # ----------------------
+
+        model.eval()
+        val_accuracy_fn.reset()
+
+        with torch.no_grad():
+            for batch in val_loader:
+                inputs = batch["image"].to(DEVICE)
+                labels = batch["label"].to(DEVICE).long()
+
+                outputs = model(inputs)
+                val_accuracy_fn.update(outputs, labels)
+
+        current_accuracy = val_accuracy_fn.compute().item()
+        val_accuracies[epoch] = current_accuracy
+
+        # Keep track of best validation accuracy and save best model so far
+        if current_accuracy > best_accuracy:
+            best_accuracy = current_accuracy
+            torch.save(model.state_dict(), save_path)
+            improv_counter = 0  # Reset counter if accuracy improves
+            print(f'Epoch {epoch + 1} (validation accuracy: {best_accuracy})')
+        else:
+            improv_counter += 1
+            print(f'No improvement for {improv_counter} epoch(s)')
+
+        if improv_counter >= early_stop:
+            print(f"Early stopping at epoch {epoch + 1}")
+            break
+
+        print(f'Epoch {epoch + 1} validation complete')
+
+    print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
+    print(f"Best model weights saved to: {save_path}")
+
+    training_metrics = {
+        "losses": training_losses,
+        "accuracies": training_accuracies,
+        "val_accuracies": val_accuracies,
+        "best_accuracy": best_accuracy,
+    }
+
+    return training_metrics
trainingModel/run_training.py CHANGED
@@ -1,20 +1,21 @@
-
+import os
 from clearml import Task
 from dataPrep.helpers.clearml_data import extract_latest_data_task
 
 import torch
-from models.modelOne import modelOne
-from trainingModel.Training import train_model
+from models.modelTwo import BetterCNN
+from trainingModel.helpers.Training import train_model
 
 
 # -------------- Load Data --------------
+NUM_WORKERS = 0
 project_name = "Small Group Project"
-subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name)
+subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name, num_workers=NUM_WORKERS)
 
 
 # -------- ClearML Training Task Setup --------
 training_task = Task.init(
-    project_name="Small Group Project",
+    project_name=f"{project_name}/Model Training",
     task_name="Model Training",
     reuse_last_task_id=False,
 )
@@ -26,18 +27,24 @@ training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
 # Training parameters - Modify these to experiment
 training_config = {
     "num_classes": 39,
-    "n_epochs": 10,
+    "n_epochs": 1,
     "learning_rate": 1e-3,
     "optimizer": "adam",
     "save_path": "best_model.pt",
+    "num_workers": NUM_WORKERS
 }
 training_task.connect(training_config)
 
 
 # -------- Build the ML model --------
-model = modelOne(noOfClasses=training_config["num_classes"])
+model = BetterCNN(noOfClasses=training_config["num_classes"])
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
 
+# Print device info
+print(f"\n**Using device: {device}**\n")
+if device.type == 'cuda':
+    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
 
 # ------- Train the model (on subset for now) -------
 
@@ -46,33 +53,27 @@ training_metrics = train_model(
     model=model,
     train_loader=subset_loaders['train'],
     val_loader=subset_loaders['val'],
-    device=device,
     n_epochs=training_config["n_epochs"],
     lr=training_config["learning_rate"],
     num_classes=training_config["num_classes"],
-    optimizer_type=training_config["optimizer"],
     save_path=training_config["save_path"],
+    early_stop=3,
 )
 
 
 # ----------- Log metrics to ClearML -----------
-# Per-batch training losses and accuracies
-for i, loss in enumerate(training_metrics["batch_losses"]):
-    training_logger.report_scalar("training batch loss", "loss", value=loss, iteration=i)
-
-for i, acc in enumerate(training_metrics["batch_accuracies"]):
-    training_logger.report_scalar("training batch accuracy", "accuracy", value=acc, iteration=i)
-
 # Per-epoch training losses and accuracies
-epoch_metrics = zip(training_metrics["epoch_losses"], training_metrics["epoch_accuracies"])
-for epoch, (loss, acc) in enumerate(epoch_metrics):
-    training_logger.report_scalar("training epoch loss", "loss", loss, iteration=epoch)
-    training_logger.report_scalar("training epoch accuracy", "accuracy", acc, iteration=epoch)
+for epoch, loss in enumerate(training_metrics["losses"]):
+    training_logger.report_scalar("training epoch loss", "loss", value=loss, iteration=epoch)
+
+for epoch, acc in enumerate(training_metrics["accuracies"]):
+    training_logger.report_scalar("training epoch accuracy", "accuracy", value=acc, iteration=epoch)
 
 # Per-epoch validation accuracies
 for epoch, acc in enumerate(training_metrics["val_accuracies"]):
     training_logger.report_scalar("validation epoch accuracy", "accuracy", value=acc, iteration=epoch)
 
+# Best validation accuracy
 training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
 
 # Upload best model as artifact