Yusuf Rahman (k22040245) committed on
Commit
0d36ad3
·
unverified ·
2 Parent(s): deb385a 8e6181a

Merge pull request #2 from K23064919/ops/clearml-setup

Browse files
dataPrep/data_preparation.py CHANGED
@@ -6,7 +6,7 @@ import random
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
- from helpers.create_dataset import load_subset_from_dataset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
@@ -15,17 +15,28 @@ import matplotlib.pyplot as plt
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
18
- from torchvision import transforms
19
- from torch.utils.data import DataLoader
20
 
21
  # --- Experiment Tracking ---
22
- from clearml import Task, Logger, Dataset
23
 
24
 
25
- # Setting up the SEED to be able to repeat experiments
 
26
  SEED = 42
 
27
  DATASET_SUBSET_RATIO = 0.25
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  random.seed(SEED)
30
  np.random.seed(SEED)
31
  torch.manual_seed(SEED)
@@ -34,20 +45,37 @@ if torch.cuda.is_available():
34
 
35
 
36
  # ----- ClearML Setup -----
37
- task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 
 
 
 
38
  task.set_random_seed(SEED)
39
  clearml_logger = task.get_logger()
40
 
41
- # Log subset config to ClearML
42
- task.connect_configuration(
43
- {"subset_ratio": DATASET_SUBSET_RATIO},
44
- name="Data subsetting"
45
- )
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # ----- Load a subset from a given dataset & track with ClearML -----
49
- data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
50
- SEED, DATASET_SUBSET_RATIO, clearml_logger
51
  )
52
 
53
 
@@ -56,7 +84,7 @@ data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_d
56
  # Reformatting the label feature to understand bias
57
  labels_list = prototyping_dataset['label']
58
  df_labels = pd.Series(labels_list)
59
- label_count = df_labels.value_counts(sort = False)
60
 
61
  # Checking the amount of samples in each class and logging it to clearML
62
 
@@ -100,12 +128,11 @@ plt.title("Class Distribution in Prototype Dataset")
100
  plt.xlabel("Class")
101
  plt.ylabel("Count")
102
  plt.tight_layout()
103
- plt.savefig("class_distribution.png")
104
 
105
- clearml_logger.report_image(
106
  title="EDA Class Distribution",
107
  series="Prototype Subset",
108
- local_path="class_distribution.png",
109
  iteration=1
110
  )
111
 
@@ -113,9 +140,16 @@ clearml_logger.report_image(
113
  # ----------------------------------------------------------------------
114
  if __name__ == "__main__":
115
 
116
- # ------------------- Dataset splits ----------------------------------
 
 
 
 
 
 
 
117
  prototype_loaders = make_dataset_loaders(
118
- prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
119
  )
120
 
121
  print("\n--- Handoff Test Successful ---")
@@ -123,8 +157,15 @@ if __name__ == "__main__":
123
  print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
124
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
125
 
 
 
 
 
 
 
 
126
  final_loaders = make_dataset_loaders(
127
- data_plants, seed=SEED, batch_size=32, test_size=0.3
128
  )
129
 
130
  print("\n--- Handoff Test Successful ---")
@@ -137,6 +178,8 @@ if __name__ == "__main__":
137
  {"dataset_id": clearml_dataset.id},
138
  name="Dataset Metadata"
139
  )
 
 
140
 
141
  # Close the ClearML task
142
  task.close()
 
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
+ from helpers.create_dataset import make_subset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
 
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
 
 
18
 
19
  # --- Experiment Tracking ---
20
+ from clearml import Task
21
 
22
 
23
+ # -------- Controllable parameters --------
24
+ # Dataset parameters
25
  SEED = 42
26
+ DATASET_LINK = "DScomp380/plant_village"
27
  DATASET_SUBSET_RATIO = 0.25
28
 
29
+ # Augmentation parameters
30
+ ROTATION = 30
31
+ BRIGHTNESS = 0.2
32
+ SATURATION = 0.2
33
+ BLUR = 3
34
+
35
+ # DataLoader parameters
36
+ BATCH_SIZE = 32
37
+ TEST_SIZE = 0.3
38
+
39
+ # Setting up the SEED to be able to repeat experiments
40
  random.seed(SEED)
41
  np.random.seed(SEED)
42
  torch.manual_seed(SEED)
 
45
 
46
 
47
  # ----- ClearML Setup -----
48
+ task = Task.init(
49
+ project_name='Small Group Project',
50
+ task_name='Data Preparation',
51
+ task_type=Task.TaskTypes.data_processing
52
+ )
53
  task.set_random_seed(SEED)
54
  clearml_logger = task.get_logger()
55
 
 
 
 
 
 
56
 
57
+ # -------- Track full configuration in ClearML --------
58
+ task.connect({
59
+ "seed": SEED,
60
+ "dataset": {
61
+ "link": DATASET_LINK,
62
+ "subset_ratio": DATASET_SUBSET_RATIO,
63
+ },
64
+ "augmentation": {
65
+ "rotation": ROTATION,
66
+ "brightness": BRIGHTNESS,
67
+ "saturation": SATURATION,
68
+ "blur": BLUR
69
+ },
70
+ "dataloaders": {
71
+ "batch_size": BATCH_SIZE,
72
+ "test_size": TEST_SIZE
73
+ }
74
+ })
75
 
76
  # ----- Load a subset from a given dataset & track with ClearML -----
77
+ data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
78
+ DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
79
  )
80
 
81
 
 
84
  # Reformatting the label feature to understand bias
85
  labels_list = prototyping_dataset['label']
86
  df_labels = pd.Series(labels_list)
87
+ label_count = df_labels.value_counts(sort=False)
88
 
89
  # Checking the amount of samples in each class and logging it to clearML
90
 
 
128
  plt.xlabel("Class")
129
  plt.ylabel("Count")
130
  plt.tight_layout()
 
131
 
132
+ clearml_logger.report_matplotlib_figure(
133
  title="EDA Class Distribution",
134
  series="Prototype Subset",
135
+ figure=plt.gcf(),
136
  iteration=1
137
  )
138
 
 
140
  # ----------------------------------------------------------------------
141
  if __name__ == "__main__":
142
 
143
+ # ---------------- Dataset splits ----------------
144
+ aug_config = {
145
+ 'rotation': ROTATION,
146
+ 'brightness': BRIGHTNESS,
147
+ 'saturation': SATURATION,
148
+ 'blur': BLUR
149
+ }
150
+
151
  prototype_loaders = make_dataset_loaders(
152
+ prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
153
  )
154
 
155
  print("\n--- Handoff Test Successful ---")
 
157
  print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
158
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
159
 
160
+ clearml_logger.report_text(
161
+ f"Prototype loaders created: "
162
+ f"train={len(prototype_loaders['train'])}, "
163
+ f"val={len(prototype_loaders['val'])}, "
164
+ f"test={len(prototype_loaders['test'])}"
165
+ )
166
+
167
  final_loaders = make_dataset_loaders(
168
+ data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config
169
  )
170
 
171
  print("\n--- Handoff Test Successful ---")
 
178
  {"dataset_id": clearml_dataset.id},
179
  name="Dataset Metadata"
180
  )
181
+ task.mark_completed()
182
+
183
 
184
  # Close the ClearML task
185
  task.close()
dataPrep/helpers/create_dataset.py CHANGED
@@ -2,19 +2,23 @@
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
 
5
  import random
6
  import numpy as np
7
  from datasets import load_dataset
8
  from clearml import Dataset
9
 
10
 
11
- # Load a DS from HuggingFace Link and subset - upload both to ClearML
12
- def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
13
- DATASET_LINK = "DScomp380/plant_village"
 
 
 
14
 
15
  # Load dataset
16
  try:
17
- ds = load_dataset(DATASET_LINK)
18
  except Exception as e:
19
  raise RuntimeError(f"Error loading the dataset: {e}")
20
 
@@ -35,15 +39,21 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
35
  # ---------- Register subset in ClearML ----------
36
  clearml_dataset = Dataset.create(
37
  dataset_name="Plant Village Prototype",
38
- dataset_project="smallGroupProject",
39
- dataset_tags=["prototype", "subset"]
 
40
  )
 
 
 
 
41
 
42
  # Save indices
43
  subset_path = "subset_indices.npy"
44
  np.save(subset_path, subset_indices)
45
  clearml_dataset.add_files(subset_path)
46
  clearml_dataset.set_metadata({
 
47
  "subset_ratio": subset_ratio,
48
  "total_samples": len(prototyping_dataset)
49
  })
@@ -52,4 +62,7 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
52
  clearml_dataset.finalize()
53
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
54
 
 
 
 
55
  return data_plants, prototyping_dataset, features, clearml_dataset
 
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
5
+ import os
6
  import random
7
  import numpy as np
8
  from datasets import load_dataset
9
  from clearml import Dataset
10
 
11
 
12
+ '''
13
+ Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML
14
+ Subset indices are uploaded to ClearML for reproducibility
15
+ REPRODUCE: Load full DS, then load indices from ClearML to get same subset
16
+ '''
17
+ def make_subset(dataset_link, subset_ratio, clearml_logger):
18
 
19
  # Load dataset
20
  try:
21
+ ds = load_dataset(dataset_link)
22
  except Exception as e:
23
  raise RuntimeError(f"Error loading the dataset: {e}")
24
 
 
39
  # ---------- Register subset in ClearML ----------
40
  clearml_dataset = Dataset.create(
41
  dataset_name="Plant Village Prototype",
42
+ dataset_project="Small Group Project",
43
+ dataset_tags=["prototype", "subset"],
44
+ use_current_task=True
45
  )
46
+ clearml_dataset.add_tags([
47
+ f"subset_ratio_{subset_ratio}",
48
+ "hf_source"
49
+ ])
50
 
51
  # Save indices
52
  subset_path = "subset_indices.npy"
53
  np.save(subset_path, subset_indices)
54
  clearml_dataset.add_files(subset_path)
55
  clearml_dataset.set_metadata({
56
+ "huggingface_dataset": dataset_link,
57
  "subset_ratio": subset_ratio,
58
  "total_samples": len(prototyping_dataset)
59
  })
 
62
  clearml_dataset.finalize()
63
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
64
 
65
+ # Clean up local file
66
+ os.remove(subset_path)
67
+
68
  return data_plants, prototyping_dataset, features, clearml_dataset
dataPrep/helpers/transforms_loaders.py CHANGED
@@ -6,13 +6,12 @@ from torchvision import transforms
6
  from torch.utils.data import DataLoader
7
 
8
 
 
 
 
9
 
10
- # Defines and returns the normalization and augmentation pipelines.
11
- def make_transform_pipelines():
12
-
13
- # Standard ImageNet mean and std - Used to normalize the tensors
14
- IMAGENET_MEAN = [0.485, 0.456, 0.406]
15
- IMAGENET_STD = [0.229, 0.224, 0.225]
16
 
17
  # Pipeline ensures image format is consistent (for Val/Test)
18
  normalisation = transforms.Compose([
@@ -24,28 +23,39 @@ def make_transform_pipelines():
24
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
25
  ])
26
 
 
 
 
 
 
 
 
 
 
 
27
  # Augmentation pipeline (to create "new" images by changing some parameters)
28
  augmentation = transforms.Compose([
29
 
30
  # Randomly changing some parameters of pictures to enrich dataset
31
- transforms.RandomRotation(30),
32
- transforms.ColorJitter(brightness=0.2, saturation=0.2),
33
- transforms.GaussianBlur(3),
34
  transforms.ToTensor(),
35
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
36
  ])
37
 
38
- return normalisation, augmentation
39
 
40
 
41
  """
42
  Creates and returns DataLoaders (train, val, test) for a given dataset.
43
  Performs a 70/15/15 split
44
  """
45
- def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
46
 
47
  # Define transformation pipelines for the dataset
48
- normalisation, augmentation = make_transform_pipelines()
 
49
 
50
  # 70/30 split creates train set
51
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
@@ -53,7 +63,7 @@ def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
53
  remaining_split = split_1['test']
54
 
55
  # 15/15 split on remaining data - validation and test sets
56
- val_split = test_size/2
57
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
58
  val_split, test_split = split_2['train'], split_2['test']
59
 
 
6
  from torch.utils.data import DataLoader
7
 
8
 
9
+ # Standard ImageNet mean and std - Used to normalize the tensors
10
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
11
+ IMAGENET_STD = [0.229, 0.224, 0.225]
12
 
13
+ # Defines and returns the normalization pipeline.
14
+ def make_norm_pipeline():
 
 
 
 
15
 
16
  # Pipeline ensures image format is consistent (for Val/Test)
17
  normalisation = transforms.Compose([
 
23
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
24
  ])
25
 
26
+ return normalisation
27
+
28
# Defines and returns the augmentation (rotation, brightness, saturation, blur) pipeline.
def make_augment_pipeline(aug_config):
    """Build the training-time augmentation transform.

    Parameters:
        aug_config (dict): must contain 'rotation' (max degrees),
            'brightness' and 'saturation' (jitter factors), and
            'blur' (Gaussian kernel size; must resolve to an odd integer).

    Returns:
        torchvision.transforms.Compose: augmentation pipeline ending in
        ToTensor + ImageNet normalization.
    """
    rotation = aug_config['rotation']
    brightness = aug_config['brightness']
    saturation = aug_config['saturation']
    # GaussianBlur requires an *integer* kernel size. Callers that read the
    # config back from ClearML task parameters get strings and cast them to
    # float, so coerce here; int(3) == 3 keeps existing behavior.
    blur = int(aug_config['blur'])

    # Augmentation pipeline (to create "new" images by changing some parameters)
    augmentation = transforms.Compose([

        # Randomly changing some parameters of pictures to enrich dataset
        transforms.RandomRotation(rotation),
        transforms.ColorJitter(brightness=brightness, saturation=saturation),
        transforms.GaussianBlur(blur),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
    ])

    return augmentation
48
 
49
 
50
  """
51
  Creates and returns DataLoaders (train, val, test) for a given dataset.
52
  Performs a 70/15/15 split
53
  """
54
+ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
55
 
56
  # Define transformation pipelines for the dataset
57
+ normalisation = make_norm_pipeline()
58
+ augmentation = make_augment_pipeline(aug_config)
59
 
60
  # 70/30 split creates train set
61
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
 
63
  remaining_split = split_1['test']
64
 
65
  # 15/15 split on remaining data - validation and test sets
66
+ val_split = 0.5
67
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
68
  val_split, test_split = split_2['train'], split_2['test']
69
 
models/__init__.py ADDED
File without changes
trainingModel/__init__.py ADDED
File without changes
trainingModel/run_training.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Rebuild the data-prep DataLoaders from a logged ClearML task and train.

Recovers every preprocessing parameter (seed, batch size, split ratio,
augmentation config) and the exact subset indices from the data-preparation
task so that training runs on byte-identical data splits.
"""
import os
import numpy as np

from clearml import Task, Dataset
from datasets import load_dataset
from dataPrep.helpers.transforms_loaders import make_dataset_loaders

import torch
from models.modelOne import modelOne
from trainingModel.Training import train_model


# Load data prep task from ClearML
DATA_PREP_TASK_ID = "f6888baedc7142fcad9e0cc6837c5cb5"
DATA_PREP = Task.get_task(task_id=DATA_PREP_TASK_ID)

data_params = DATA_PREP.get_parameters()
dataset_link = data_params['General/dataset/link']

# Load the whole dataset
try:
    ds = load_dataset(dataset_link)
except Exception as e:
    # Chain the cause so the original HuggingFace error stays visible.
    raise RuntimeError(f"Error loading the dataset: {e}") from e

full_dataset = ds['train']


# Load the subset indices from ClearML
# NOTE(review): SUBSET_ID is byte-identical to DATA_PREP_TASK_ID above, but
# Dataset.get expects a ClearML *dataset* id, not a task id — confirm this is
# the id of the Dataset created by make_subset during data prep.
SUBSET_ID = "f6888baedc7142fcad9e0cc6837c5cb5"
subset_clearml = Dataset.get(dataset_id=SUBSET_ID)

local_folder = subset_clearml.get_local_copy()
subset_indices_path = os.path.join(local_folder, "subset_indices.npy")
subset_indices = np.load(subset_indices_path)

print("Loaded subset indices:", subset_indices.shape)


# Apply subset indices to full dataset - this gives you the same subset as data prep
subset_dataset = full_dataset.select(subset_indices)


# Extract parameters from data prep task - these will create the DataLoaders.
# ClearML returns every parameter as a string, so cast each to its real type.
seed = int(data_params['General/seed'])
batch_size = int(data_params['General/dataloaders/batch_size'])
test_size = float(data_params['General/dataloaders/test_size'])

aug_config = {
    'rotation': float(data_params['General/augmentation/rotation']),
    'brightness': float(data_params['General/augmentation/brightness']),
    'saturation': float(data_params['General/augmentation/saturation']),
    # GaussianBlur needs an odd *integer* kernel size; float(…) alone would
    # make torchvision raise when the augmentation pipeline is built.
    'blur': int(float(data_params['General/augmentation/blur']))
}

# Create DataLoaders using the parameters from data prep
subset_loaders = make_dataset_loaders(
    subset_dataset, seed, batch_size, test_size, aug_config
)

print("\n--- Handoff Test Successful ---")
print(f"Prototype Train loader batches: {len(subset_loaders['train'])}")
print(f"Prototype Validation loader batches: {len(subset_loaders['val'])}")
print(f"Prototype Test loader batches: {len(subset_loaders['test'])}")


full_loaders = make_dataset_loaders(
    full_dataset, seed, batch_size, test_size, aug_config
)

print("\n--- Handoff Test Successful ---")
print(f"Train loader batches: {len(full_loaders['train'])}")
print(f"Validation loader batches: {len(full_loaders['val'])}")
print(f"Test loader batches: {len(full_loaders['test'])}")


# -------- Build the ML model --------
model = modelOne(noOfClasses=39)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ------- Train the model (on subset for now) -------
'''
When calling this function, the model should be trained on the given dataset

train_model(
    model=model,
    train_loader=subset_loaders['train'],
    val_loader=subset_loaders['val'],
    device=device,
    n_epochs=10,
    lr=1e-3,
    save_path="best_model.pt",
)
'''