Spaces:

k23064919
/

smallGroupProject

Sleeping

App Files Files Community

k23064919 committed on Nov 26, 2025

Commit

7875de0

2 Parent(s): 534796e 1de07b3

Merge branch 'develop' of https://github.kcl.ac.uk/K23064919/smallGroupProject into develop

Browse files

Files changed (8) hide show

dataPrep/data_preparation.py +63 -20
dataPrep/helpers/create_dataset.py +22 -8
dataPrep/helpers/transforms_loaders.py +37 -19
models/__init__.py +0 -0
models/modelOne.py +2 -1
trainingModel/Training.py +32 -35
trainingModel/__init__.py +0 -0
trainingModel/run_training.py +149 -0

dataPrep/data_preparation.py CHANGED Viewed

@@ -6,7 +6,7 @@ import random
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
-from helpers.create_dataset import load_subset_from_dataset
 from helpers.transforms_loaders import make_dataset_loaders
 # --- Visualization ---
@@ -15,17 +15,28 @@ import matplotlib.pyplot as plt
 # --- PyTorch (Machine Learning) ---
 import torch
-from torchvision import transforms
-from torch.utils.data import DataLoader
 # --- Experiment Tracking ---
-from clearml import Task, Logger, Dataset
-# Setting up the SEED to be able to repeat experiments
 SEED = 42
 DATASET_SUBSET_RATIO = 0.25
 random.seed(SEED)
 np.random.seed(SEED)
 torch.manual_seed(SEED)
@@ -34,20 +45,37 @@ if torch.cuda.is_available():
 # ----- ClearML Setup -----
-task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
-# Log subset config to ClearML
-task.connect_configuration(
-    {"subset_ratio": DATASET_SUBSET_RATIO},
-    name="Data subsetting"
-)
 # ----- Load a subset from a given dataset & track with ClearML -----
-data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
-    SEED, DATASET_SUBSET_RATIO, clearml_logger
 )
@@ -56,7 +84,7 @@ data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_d
 # Reformatting the label feature to understand bias
 labels_list = prototyping_dataset['label']
 df_labels = pd.Series(labels_list)
-label_count = df_labels.value_counts(sort = False)
 # Checking the amount of samples in each class and logging it to clearML
@@ -100,12 +128,11 @@ plt.title("Class Distribution in Prototype Dataset")
 plt.xlabel("Class")
 plt.ylabel("Count")
 plt.tight_layout()
-plt.savefig("class_distribution.png")
-clearml_logger.report_image(
     title="EDA Class Distribution",
     series="Prototype Subset",
-    local_path="class_distribution.png",
     iteration=1
 )
@@ -113,9 +140,16 @@ clearml_logger.report_image(
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
-    # ------------------- Dataset splits ----------------------------------
     prototype_loaders = make_dataset_loaders(
-        prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
     )
     print("\n--- Handoff Test Successful ---")
@@ -123,8 +157,15 @@ if __name__ == "__main__":
     print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
     print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
     final_loaders = make_dataset_loaders(
-        data_plants, seed=SEED, batch_size=32, test_size=0.3
     )
     print("\n--- Handoff Test Successful ---")
@@ -137,6 +178,8 @@ if __name__ == "__main__":
         {"dataset_id": clearml_dataset.id},
         name="Dataset Metadata"
     )
     # Close the ClearML task
     task.close()

 import numpy as np
 import pandas as pd
 from datasets import load_dataset
+from helpers.create_dataset import make_subset
 from helpers.transforms_loaders import make_dataset_loaders
 # --- Visualization ---
 # --- PyTorch (Machine Learning) ---
 import torch
 # --- Experiment Tracking ---
+from clearml import Task
+# -------- Controllable parameters --------
+# Dataset parameters
 SEED = 42
+DATASET_LINK = "DScomp380/plant_village"
 DATASET_SUBSET_RATIO = 0.25
+# Augmentation parameters
+ROTATION = 30
+BRIGHTNESS = 0.2
+SATURATION = 0.2
+BLUR = 3
+# DataLoader parameters
+BATCH_SIZE = 32
+TEST_SIZE = 0.3
+# Setting up the SEED to be able to repeat experiments
 random.seed(SEED)
 np.random.seed(SEED)
 torch.manual_seed(SEED)
 # ----- ClearML Setup -----
+task = Task.init(
+    project_name='Small Group Project',
+    task_name='Data Preparation',
+    task_type=Task.TaskTypes.data_processing
+)
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
+# -------- Track full configuration in ClearML --------
+task.connect({
+    "seed": SEED,
+    "dataset": {
+        "link": DATASET_LINK,
+        "subset_ratio": DATASET_SUBSET_RATIO,
+    },
+    "augmentation": {
+        "rotation": ROTATION,
+        "brightness": BRIGHTNESS,
+        "saturation": SATURATION,
+        "blur": BLUR
+    },
+    "dataloaders": {
+        "batch_size": BATCH_SIZE,
+        "test_size": TEST_SIZE
+    }
+})
 # ----- Load a subset from a given dataset & track with ClearML -----
+data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
+    DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
 )
 # Reformatting the label feature to understand bias
 labels_list = prototyping_dataset['label']
 df_labels = pd.Series(labels_list)
+label_count = df_labels.value_counts(sort=False)
 # Checking the amount of samples in each class and logging it to clearML
 plt.xlabel("Class")
 plt.ylabel("Count")
 plt.tight_layout()
+clearml_logger.report_matplotlib_figure(
     title="EDA Class Distribution",
     series="Prototype Subset",
+    figure=plt.gcf(),
     iteration=1
 )
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
+    # ---------------- Dataset splits ----------------
+    aug_config = {
+        'rotation': ROTATION,
+        'brightness': BRIGHTNESS,
+        'saturation': SATURATION,
+        'blur': BLUR
+    }
     prototype_loaders = make_dataset_loaders(
+        prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
     )
     print("\n--- Handoff Test Successful ---")
     print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
     print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
+    clearml_logger.report_text(
+        f"Prototype loaders created: "
+        f"train={len(prototype_loaders['train'])}, "
+        f"val={len(prototype_loaders['val'])}, "
+        f"test={len(prototype_loaders['test'])}"
+    )
     final_loaders = make_dataset_loaders(
+        data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config
     )
     print("\n--- Handoff Test Successful ---")
         {"dataset_id": clearml_dataset.id},
         name="Dataset Metadata"
     )
+    task.mark_completed()
     # Close the ClearML task
     task.close()

dataPrep/helpers/create_dataset.py CHANGED Viewed

@@ -2,19 +2,23 @@
 A collection of dataset (DS) loading and subsetting functions.
 """
 import random
 import numpy as np
 from datasets import load_dataset
 from clearml import Dataset
-# Load a DS from HuggingFace Link and subset - upload both to ClearML
-def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
-    DATASET_LINK = "DScomp380/plant_village"
     # Load dataset
     try:
-        ds = load_dataset(DATASET_LINK)
     except Exception as e:
         raise RuntimeError(f"Error loading the dataset: {e}")
@@ -31,19 +35,26 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
     subset_indices = indices[:subset_size]
     prototyping_dataset = data_plants.select(subset_indices)
-    # ---------- Register subset in ClearML ----------
     clearml_dataset = Dataset.create(
         dataset_name="Plant Village Prototype",
-        dataset_project="smallGroupProject",
-        dataset_tags=["prototype", "subset"]
     )
     # Save indices
     subset_path = "subset_indices.npy"
     np.save(subset_path, subset_indices)
     clearml_dataset.add_files(subset_path)
     clearml_dataset.set_metadata({
         "subset_ratio": subset_ratio,
         "total_samples": len(prototyping_dataset)
     })
@@ -52,4 +63,7 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
     clearml_dataset.finalize()
     clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
     return data_plants, prototyping_dataset, features, clearml_dataset

 A collection of dataset (DS) loading and subsetting functions.
 """
+import os
 import random
 import numpy as np
 from datasets import load_dataset
 from clearml import Dataset
+'''
+Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML
+Subset indices are uploaded to ClearML for reproducibility
+REPRODUCE: Load full DS, then load indices from ClearML to get same subset
+'''
+def make_subset(dataset_link, subset_ratio, clearml_logger):
     # Load dataset
     try:
+        ds = load_dataset(dataset_link)
     except Exception as e:
         raise RuntimeError(f"Error loading the dataset: {e}")
     subset_indices = indices[:subset_size]
     prototyping_dataset = data_plants.select(subset_indices)
+# TODO: Consider removing this step later.
+# We probably do not need to upload the subset on every run (to be confirmed).
+    # Register subset in ClearML
     clearml_dataset = Dataset.create(
         dataset_name="Plant Village Prototype",
+        dataset_project="Small Group Project",
+        dataset_tags=["prototype", "subset"],
+        use_current_task=False
     )
+    clearml_dataset.add_tags([
+        f"subset_ratio_{subset_ratio}",
+        "hf_source"
+    ])
     # Save indices
     subset_path = "subset_indices.npy"
     np.save(subset_path, subset_indices)
     clearml_dataset.add_files(subset_path)
     clearml_dataset.set_metadata({
+        "huggingface_dataset": dataset_link,
         "subset_ratio": subset_ratio,
         "total_samples": len(prototyping_dataset)
     })
     clearml_dataset.finalize()
     clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
+    # Clean up local file
+    os.remove(subset_path)
     return data_plants, prototyping_dataset, features, clearml_dataset

dataPrep/helpers/transforms_loaders.py CHANGED Viewed

@@ -6,17 +6,16 @@ from torchvision import transforms
 from torch.utils.data import DataLoader
-# Defines and returns the normalization and augmentation pipelines.
-def make_transform_pipelines():
-    # Standard ImageNet mean and std - Used to normalize the tensors
-    IMAGENET_MEAN = [0.485, 0.456, 0.406]
-    IMAGENET_STD = [0.229, 0.224, 0.225]
     # Pipeline ensures image format is consistent (for Val/Test)
     normalisation = transforms.Compose([
         # Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
         transforms.ToTensor(),
@@ -24,43 +23,62 @@ def make_transform_pipelines():
         transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
     ])
     # Augmentation pipeline (to create "new" images by changing some parameters)
     augmentation = transforms.Compose([
         # Randomly changing some parameters of pictures to enrich dataset
-        transforms.RandomRotation(30),
-        transforms.ColorJitter(brightness=0.2, saturation=0.2),
-        transforms.GaussianBlur(3),
         transforms.ToTensor(),
         transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
     ])
-    return normalisation, augmentation
 """
 Creates and returns DataLoaders (train, val, test) for a given dataset.
 Performs a 70/15/15 split
 """
-def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
     # Define transformation pipelines for the dataset
-    normalisation, augmentation = make_transform_pipelines()
     # 70/30 split creates train set
     split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
     train_split = split_1['train']
     remaining_split = split_1['test']
     # 15/15 split on remaining data - validation and test sets
-    val_split = test_size/2
     split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
     val_split, test_split = split_2['train'], split_2['test']
     # Put each split through pipelines
-    train_split.set_transform(augmentation)
-    val_split.set_transform(normalisation)
-    test_split.set_transform(normalisation)
     # Create dataloader for each
     train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)

 from torch.utils.data import DataLoader
+# Standard ImageNet mean and std - Used to normalize the tensors
+IMAGENET_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_STD = [0.229, 0.224, 0.225]
+IMAGE_SIZE = (256, 256)
+# Defines and returns the normalization pipeline.
+def make_norm_pipeline():
+    """
+    Build the preprocessing pipeline that contains no random augmentation.
+    Steps, in order: resize to IMAGE_SIZE, convert to a tensor, normalise
+    with IMAGENET_MEAN / IMAGENET_STD.
+    Returns:
+        The composed torchvision transform (transforms.Compose).
+    """
     # Pipeline ensures image format is consistent (for Val/Test)
     normalisation = transforms.Compose([
+        transforms.Resize(IMAGE_SIZE),
         # Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
         transforms.ToTensor(),
         transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
     ])
+    return normalisation
+# Defines and returns the augmentation (rotation, brightness, saturation, blur) pipeline.
+def make_augment_pipeline(aug_config):
+    """
+    Build the augmentation pipeline from the values in aug_config.
+    Args:
+        aug_config: mapping indexed with the keys 'rotation', 'brightness',
+            'saturation' and 'blur'; all four are read unconditionally.
+    Returns:
+        The composed torchvision transform (transforms.Compose): resize to
+        IMAGE_SIZE, random rotation, colour jitter, Gaussian blur, tensor
+        conversion, then normalisation with IMAGENET_MEAN / IMAGENET_STD.
+    """
+    rotation = aug_config['rotation']
+    brightness = aug_config['brightness']
+    saturation = aug_config['saturation']
+    blur = aug_config['blur']
     # Augmentation pipeline (to create "new" images by changing some parameters)
     augmentation = transforms.Compose([
+        transforms.Resize(IMAGE_SIZE),
         # Randomly changing some parameters of pictures to enrich dataset
+        transforms.RandomRotation(rotation),
+        transforms.ColorJitter(brightness=brightness, saturation=saturation),
+        # NOTE(review): 'blur' is GaussianBlur's first positional argument,
+        # presumably the kernel size - confirm callers pass an odd positive integer.
+        transforms.GaussianBlur(blur),
         transforms.ToTensor(),
         transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
     ])
+    return augmentation
 """
 Creates and returns DataLoaders (train, val, test) for a given dataset.
 Performs a 70/15/15 split
 """
+def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
     # Define transformation pipelines for the dataset
+    normalisation = make_norm_pipeline()
+    augmentation = make_augment_pipeline(aug_config)
+    def apply_augmentation(batch):
+        batch['image'] = [augmentation(x) for x in batch['image']]
+        return batch
+    def apply_normalisation(batch):
+        batch['image'] = [normalisation(x) for x in batch['image']]
+        return batch
     # 70/30 split creates train set
     split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
     train_split = split_1['train']
     remaining_split = split_1['test']
     # 15/15 split on remaining data - validation and test sets
+    val_split = 0.5
     split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
     val_split, test_split = split_2['train'], split_2['test']
     # Put each split through pipelines
+    train_split.set_transform(apply_augmentation)
+    val_split.set_transform(apply_normalisation)
+    test_split.set_transform(apply_normalisation)
     # Create dataloader for each
     train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)

models/__init__.py ADDED Viewed

File without changes

models/modelOne.py CHANGED Viewed

@@ -13,7 +13,7 @@ class modelOne(nn.Module) :
         self.conv2 = nn.Conv2d(6, 16, 5, padding=2)
         self.batchNorm2 = nn.BatchNorm2d(16)
-        self.fc1 = nn.Linear(16*64*64, 512)
         self.dropout = nn.Dropout(0.5)
         self.fc2 = nn.Linear(512, 84)
@@ -23,6 +23,7 @@ class modelOne(nn.Module) :
         x = self.pool(F.relu(self.batchNorm1(self.conv1(x))))
         x = self.pool(F.relu(self.batchNorm2(self.conv2(x))))
         x = torch.flatten(x, 1)
         x = self.dropout(x)
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))

         self.conv2 = nn.Conv2d(6, 16, 5, padding=2)
         self.batchNorm2 = nn.BatchNorm2d(16)
+        self.fc1 = nn.Linear(63504, 512)
         self.dropout = nn.Dropout(0.5)
         self.fc2 = nn.Linear(512, 84)
         x = self.pool(F.relu(self.batchNorm1(self.conv1(x))))
         x = self.pool(F.relu(self.batchNorm2(self.conv2(x))))
         x = torch.flatten(x, 1)
+        print("Flattened size:", x.shape[1])
         x = self.dropout(x)
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))

trainingModel/Training.py CHANGED Viewed

@@ -2,16 +2,10 @@ import torch
 import torch.nn as nn
 import numpy as np
 from torcheval.metrics import MulticlassAccuracy
-#from torchvision import transforms
 from torch.utils.data import DataLoader
-#from torchvision.datasets import MNIST
-#import torchvision.utils
-# loss, optimizer, training loop, validation, best model saving
 def train_model(
@@ -26,7 +20,19 @@ def train_model(
     num_classes : int = 39,
 ):
     # Move model to device
@@ -43,19 +49,20 @@ def train_model(
     # Arrays to log metrics
     num_batches = len(train_loader)
     # Store training losses and accuracies for every batch
     # num_batches is the number of batches for every epoch
     training_losses = np.zeros(num_batches * n_epochs)
     training_accuracies = np.zeros(num_batches * n_epochs)
     # store validation accuracy for every epoch
     val_accuracies = np.zeros(n_epochs)
     # keep track of best validation accuracy and best model
     best_accuracy = 0.0
     #----------------------
     # training loop
     #----------------------
@@ -69,16 +76,14 @@ def train_model(
             # move to GPU memory
             inputs = batch["image"].to(device)
-            labels = batch["label"].to(device)
             # flatten if not cnn REVISE LATER
             if flatten_input:
                 inputs = inputs.view(inputs.size(0), -1)
             optimizer.zero_grad()
             # Forward pass
             outputs = model(inputs)
             loss = criterion(outputs, labels)
@@ -92,40 +97,31 @@ def train_model(
             # log the loss value
             training_losses[epoch * num_batches + i] = loss.item()
-            # Compute accuracy of the batch.
             #updates the accuracy computation with new data
             train_accuracy_fn.update(outputs, labels)
             #compute accuracy with the current data
             training_accuracies[epoch * num_batches + i] = train_accuracy_fn.compute().item()
-            # display some progress (every 200 batches)
-            # optional, you can comment out
-            # if i % 200 == 0:
-            #     print(f'Epoch {epoch + 1}, batch {i+1} of {len(train_loader)}')
         print(f'Epoch {epoch + 1} training complete')
-        # Validation after each epoch
         model.eval()
         val_accuracy_fn.reset()
-        # The context 'torch.no_grad()' tells pytorch we are not interested in computing
-        # gradients here, so forward pass is more efficient
         with torch.no_grad():
-            for i, batch in enumerate(val_loader):
                 inputs = batch["image"].to(device)
-                labels = batch["label"].to(device)
                 # flatten if not cnn REVISE LATER
                 if flatten_input:
                     inputs = inputs.view(inputs.size(0), -1)
                 outputs = model(inputs)
                 val_accuracy_fn.update(outputs, labels)
@@ -133,7 +129,6 @@ def train_model(
         current_accuracy = val_accuracy_fn.compute().item()
         val_accuracies[epoch] = current_accuracy
         # keep track of best validation accuracy and save best model so far
         if current_accuracy > best_accuracy:
             best_accuracy = current_accuracy
@@ -144,10 +139,12 @@ def train_model(
     print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
     print(f"Best model weights saved to: {save_path}")
-    return training_losses, training_accuracies, val_accuracies, best_accuracy
-    #tweak later
-    #best_model = MNISTNet().to(device)
-    #best_model.load_state_dict(
-    #    torch.load('mnist-torch-best_model.pt', map_location=device))

 import torch.nn as nn
 import numpy as np
 from torcheval.metrics import MulticlassAccuracy
 from torch.utils.data import DataLoader
+# fix errors in runtime
 def train_model(
     num_classes : int = 39,
 ):
+    """
+    Trains the given model and returns:
+    - training_losses: numpy array of loss per batch
+    - training_accuracies: numpy array of running accuracy per batch
+    - val_accuracies: numpy array of accuracy per epoch
+    - best_accuracy: highest validation accuracy achieved
+    Expected batch format:
+        batch["image"] → Tensor [B, C, H, W]
+        batch["label"] → Tensor [B] with class IDs (int64)
+    Model output:
+        outputs → Tensor [B, num_classes] (logits)
+    """
     # Move model to device
     # Arrays to log metrics
     num_batches = len(train_loader)
+    if num_batches == 0:
+        raise RuntimeError("UH OH!!!! empty train loader")
     # Store training losses and accuracies for every batch
     # num_batches is the number of batches for every epoch
     training_losses = np.zeros(num_batches * n_epochs)
     training_accuracies = np.zeros(num_batches * n_epochs)
     # store validation accuracy for every epoch
     val_accuracies = np.zeros(n_epochs)
     # keep track of best validation accuracy and best model
     best_accuracy = 0.0
     #----------------------
     # training loop
     #----------------------
             # move to GPU memory
             inputs = batch["image"].to(device)
+            labels = batch["label"].to(device).long()
             # flatten if not cnn REVISE LATER
             if flatten_input:
                 inputs = inputs.view(inputs.size(0), -1)
             optimizer.zero_grad()
             # Forward pass
             outputs = model(inputs)
             loss = criterion(outputs, labels)
             # log the loss value
             training_losses[epoch * num_batches + i] = loss.item()
             #updates the accuracy computation with new data
             train_accuracy_fn.update(outputs, labels)
             #compute accuracy with the current data
             training_accuracies[epoch * num_batches + i] = train_accuracy_fn.compute().item()
         print(f'Epoch {epoch + 1} training complete')
+        # ----------------------
+        # validation loop
+        # ----------------------
         model.eval()
         val_accuracy_fn.reset()
         with torch.no_grad():
+            for batch in val_loader:
                 inputs = batch["image"].to(device)
+                labels = batch["label"].to(device).long()
                 # flatten if not cnn REVISE LATER
                 if flatten_input:
                     inputs = inputs.view(inputs.size(0), -1)
                 outputs = model(inputs)
                 val_accuracy_fn.update(outputs, labels)
         current_accuracy = val_accuracy_fn.compute().item()
         val_accuracies[epoch] = current_accuracy
         # keep track of best validation accuracy and save best model so far
         if current_accuracy > best_accuracy:
             best_accuracy = current_accuracy
     print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
     print(f"Best model weights saved to: {save_path}")
+    training_metrics = {
+        "losses": training_losses,
+        "accuracies": training_accuracies,
+        "val_accuracies": val_accuracies,
+        "best_accuracy": best_accuracy,
+    }
+    return training_metrics

trainingModel/__init__.py ADDED Viewed

File without changes

trainingModel/run_training.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os
+import numpy as np
+from clearml import Task, Dataset
+from datasets import load_dataset
+from dataPrep.helpers.transforms_loaders import make_dataset_loaders
+import torch
+from models.modelOne import modelOne
+from trainingModel.Training import train_model
+# -------------- Load Data --------------
+all_tasks = Task.get_tasks(project_name="Small Group Project")
+if not all_tasks:
+    raise RuntimeError("No tasks found in project 'Small Group Project'")
+dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
+if not dp_tasks:
+    raise RuntimeError("No 'Data Preparation' tasks found in this project!")
+# Latest Data Prep Task
+latest_task = max(dp_tasks, key=lambda t: t.id)
+DYNAMIC_TASK_ID = latest_task.id
+DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
+# Dataset ID
+config_objects = DATA_PREP.get_configuration_objects()
+raw_meta = config_objects["Dataset Metadata"]
+dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
+# Load ClearML Dataset
+subset_clearml = Dataset.get(dataset_id=dataset_id)
+local_folder = subset_clearml.get_local_copy()
+subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
+# Load Dataset Parameters
+data_params = DATA_PREP.get_parameters()
+dataset_link = data_params['General/dataset/link']
+# Load Full Dataset
+try:
+    ds = load_dataset(dataset_link)
+except Exception as e:
+    raise RuntimeError(f"Error loading the dataset: {e}")
+full_dataset = ds['train']
+# Apply subset indices to full dataset - this gives you the same subset as data prep
+subset_dataset = full_dataset.select(subset_indices)
+# Extract parameters from data prep task - these will create the DataLoaders
+seed = int(data_params['General/seed'])
+batch_size = int(data_params['General/dataloaders/batch_size'])
+test_size = float(data_params['General/dataloaders/test_size'])
+aug_config = {
+    'rotation': float(data_params['General/augmentation/rotation']),
+    'brightness': float(data_params['General/augmentation/brightness']),
+    'saturation': float(data_params['General/augmentation/saturation']),
+    'blur': float(data_params['General/augmentation/blur'])
+}
+# Create DataLoaders using the parameters from data prep
+subset_loaders = make_dataset_loaders(
+    subset_dataset, seed, batch_size, test_size, aug_config
+)
+print("\n--- Handoff Test Successful ---")
+print(f"Prototype Train loader batches: {len(subset_loaders['train'])}")
+print(f"Prototype Validation loader batches: {len(subset_loaders['val'])}")
+print(f"Prototype Test loader batches: {len(subset_loaders['test'])}")
+full_loaders = make_dataset_loaders(
+    full_dataset, seed, batch_size, test_size, aug_config
+)
+print("\n--- Handoff Test Successful ---")
+print(f"Train loader batches: {len(full_loaders['train'])}")
+print(f"Validation loader batches: {len(full_loaders['val'])}")
+print(f"Test loader batches: {len(full_loaders['test'])}")
+# -------------- DATA PREP ENDS --------------
+# -------- ClearML Training Task Setup --------
+training_task = Task.init(
+    project_name="Small Group Project",
+    task_name="Model Training",
+    reuse_last_task_id=False,
+)
+training_logger = training_task.get_logger()
+training_task.connect({"data_prep_task_used": DYNAMIC_TASK_ID})
+# Training parameters - Modify these to experiment
+training_config = {
+    "num_classes": 39,
+    "n_epochs": 1,
+    "learning_rate": 1e-3,
+    "batch_size": batch_size,
+    "save_path": "best_model.pt",
+}
+training_task.connect(training_config)
+# -------- Build the ML model --------
+model = modelOne(noOfClasses=training_config["num_classes"])
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ------- Train the model (on subset for now) -------
+print("\n--- Starting Model Training on Subset ---")
+training_metrics = train_model(
+    model=model,
+    train_loader=subset_loaders['train'],
+    val_loader=subset_loaders['val'],
+    device=device,
+    n_epochs=training_config["n_epochs"],
+    lr=training_config["learning_rate"],
+    save_path=training_config["save_path"],
+)
+# ----------- Log metrics to ClearML -----------
+# Per-batch training losses and accuracies
+for i, loss in enumerate(training_metrics["losses"]):
+    training_logger.report_scalar("train", "loss_per_batch", value=loss, iteration=i)
+for i, acc in enumerate(training_metrics["accuracies"]):
+    training_logger.report_scalar("train", "accuracy_per_batch", value=acc, iteration=i)
+# Per-epoch validation accuracy
+for epoch, acc in enumerate(training_metrics["val_accuracies"]):
+    training_logger.report_scalar("validation", "accuracy_per_epoch", value=acc, iteration=epoch)
+training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
+# Upload best model as artifact
+training_task.upload_artifact("best_model", training_config["save_path"])
+print("\nTraining complete.")
+training_task.close()