Yusuf committed on
Commit
04cb886
·
1 Parent(s): 6b1327e

CHORE: separate dataset load & transform pipelines

dataPrep/data_preparation.py CHANGED
@@ -6,6 +6,8 @@ import random
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
+from helpers.create_dataset import load_subset_from_dataset
+from helpers.transforms_loaders import make_dataset_loaders
 
 # --- Visualization ---
 import matplotlib.pyplot as plt
@@ -22,6 +24,8 @@ from clearml import Task, Logger, Dataset
 
 # Setting up the SEED to be able to repeat experiments
 SEED = 42
+DATASET_SUBSET_RATIO = 0.25
+
 random.seed(SEED)
 np.random.seed(SEED)
 torch.manual_seed(SEED)
@@ -29,66 +33,23 @@ if torch.cuda.is_available():
     torch.cuda.manual_seed_all(SEED)
 
 
-# Initialising a task on ClearML
-# UPDATE CLEARML
+# ----- ClearML Setup -----
 task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
 
-# Loading dataset from HugginFace and checking it
-try:
-    ds = load_dataset("DScomp380/plant_village")
-except Exception as e:
-    print(f"Error loading the dataset: {e}")
-
-data_plants = ds['train']
-data_length = len(data_plants)
-features = data_plants.features
-
-# --------------------------- Data selection --------------------------------
-# Creating the prototyping dataset
-SUBSET_RATIO = 0.25 # 25% for prototyping
-
 # Log subset config to ClearML
 task.connect_configuration(
-    {"subset_ratio": SUBSET_RATIO},
+    {"subset_ratio": DATASET_SUBSET_RATIO},
     name="Data subsetting"
 )
 
-# Calculate amount of samples we use
-subset_size = int(data_length * SUBSET_RATIO)
-
-# Creating a subset of random data (by their indices)
-indices = list(range(data_length))
-random.shuffle(indices)
-subset_indices = indices[:subset_size]
-prototyping_dataset = data_plants.select(subset_indices)
 
-# Register this subset in ClearML
-dataset = Dataset.create(
-    dataset_name="Plant Village Prototype",
-    dataset_project="smallGroupProject",
-    dataset_tags=["prototype", "subset"]
+# ----- Load a subset from a given dataset & track with ClearML -----
+data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
+    SEED, DATASET_SUBSET_RATIO, clearml_logger
 )
 
-# Save indicies used for reproducibility
-subset_path = "subset_indices.npy"
-np.save(subset_path, subset_indices)
-dataset.add_files(subset_path)
-
-# Add simple metadata
-dataset.set_metadata({
-    "subset_ratio": SUBSET_RATIO,
-    "total_samples": len(prototyping_dataset)
-})
-
-# Upload to ClearML storage
-dataset.upload()
-dataset.finalize()
-
-# Log the dataset ID
-clearml_logger.report_text(f"Created ClearML Dataset: {dataset.id}")
-
 
 # ---- Exploratory data analysis (EDA) ----
 
@@ -149,130 +110,31 @@ clearml_logger.report_image(
 )
 
 
-# --------------- Data Splits ------------
-def get_transform_pipelines():
-    """
-    Defines and returns the normalization and augmentation pipelines.
-    """
-    # Standard ImageNet mean and std
-    # These values are used to normalize the tensors
-    IMAGENET_MEAN = [0.485, 0.456, 0.406]
-    IMAGENET_STD = [0.229, 0.224, 0.225]
-
-    # Defining pipeline to ensure that images are consistently formatted (for Val/Test)
-    normalisation_pipeline = transforms.Compose([
-        # Convert PIL Image to a PyTorch Tensor
-        # This also scales pixel values from [0, 255] to [0.0, 1.0]
-        transforms.ToTensor(),
-
-        # Normalise the Tensor; Standartises pixel values
-        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-    ])
-    # Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
-    augmentation_pipeline = transforms.Compose([
-        # Randomly changing some parameters of pictures to enrich dataset
-        transforms.RandomRotation(degrees=30),
-        transforms.ColorJitter(brightness=0.2, saturation=0.2),
-        transforms.GaussianBlur(kernel_size=3),
-
-        # Convert to Tensor and Normalise
-        transforms.ToTensor(),
-        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-    ])
-    # Return both pipelines
-    return normalisation_pipeline, augmentation_pipeline
-
-
-def get_prototype_loaders(batch_size=32):
-    """
-    Creates and returns DataLoaders for the 25% PROTOTYPE dataset.
-    """
-    # Calling function to define pipelines
-    normalisation_pipeline, augmentation_pipeline = get_transform_pipelines()
-
-    # -- Split the prototype dataset --
-    # This returns a dictionary: {'train': 70%, 'test': 30%}
-    split_1_dict = prototyping_dataset.train_test_split(test_size=0.3, seed=SEED)
-
-    # Assign the 70% part to final train split
-    proto_train_split = split_1_dict['train']
-
-    # Assign the 30% part to a temporary var
-    proto_temp_split = split_1_dict['test']
-
-    # Split 30% into 2 15%
-    # This returns a dictionary: {'train': 50%, 'test': 50%}
-    split_2_dict = proto_temp_split.train_test_split(test_size=0.5, seed=SEED)
-
-    proto_val_split = split_2_dict['train']
-    proto_test_split = split_2_dict['test']
-
-    # -- Putting splits through pipelines --
-    proto_train_split.set_transform(augmentation_pipeline)
-    proto_val_split.set_transform(normalisation_pipeline)
-    proto_test_split.set_transform(normalisation_pipeline)
-
-    # -- Creating the prototype dataloaders --
-    proto_train_loader = DataLoader(dataset = proto_train_split, batch_size = batch_size, shuffle = True )
-    proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
-    proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
-
-    return proto_train_loader, proto_val_loader, proto_test_loader
-
-
-def get_final_loaders(batch_size=32):
-    """
-    Creates and returns DataLoaders for the 100% FINAL dataset.
-    """
-    # Calling function to define pipelines
-    normalisation_pipeline, augmentation_pipeline = get_transform_pipelines()
-
-    # -- Split the FULL dataset --
-    # This returns a dictionary: {'train': 70%, 'test': 30%}
-    split_1_dict = data_plants.train_test_split(test_size=0.3, seed=SEED)
-
-    # Assign the 70% part to final train split
-    train_split = split_1_dict['train']
-
-    # Assign the 30% part to a temporary var
-    temp_split = split_1_dict['test']
-
-    # Split 30% into 2 15%
-    # This returns a dictionary: {'train': 50%, 'test': 50%}
-    split_2_dict = temp_split.train_test_split(test_size=0.5, seed=SEED)
-
-    val_split = split_2_dict['train']
-    test_split = split_2_dict['test']
-
-    # -- Putting splits through pipelines --
-    train_split.set_transform(augmentation_pipeline)
-    val_split.set_transform(normalisation_pipeline)
-    test_split.set_transform(normalisation_pipeline)
-
-    # -- Creating the final dataloaders --
-    train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
-    val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
-    test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
-    return train_loader, val_loader, test_loader
-
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
 
-    train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
+    # ------------------- Dataset splits ----------------------------------
+    prototype_loaders = make_dataset_loaders(
+        prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
+    )
+
     print("\n--- Handoff Test Successful ---")
-    print(f"Train loader batches: {len(train_loader)}")
-    print(f"Validation loader batches: {len(val_loader)}")
-    print(f"Test loader batches: {len(test_loader)}")
+    print(f"Prototype Train loader batches: {len(prototype_loaders['train'])}")
+    print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
+    print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
+
+    final_loaders = make_dataset_loaders(
+        data_plants, seed=SEED, batch_size=32, test_size=0.3
+    )
 
-    train_loader_fin, val_loader_fin, test_loader_fin = get_final_loaders(batch_size=32)
     print("\n--- Handoff Test Successful ---")
-    print(f"Train loader batches: {len(train_loader_fin)}")
-    print(f"Validation loader batches: {len(val_loader_fin)}")
-    print(f"Test loader batches: {len(test_loader_fin)}")
+    print(f"Train loader batches: {len(final_loaders['train'])}")
+    print(f"Validation loader batches: {len(final_loaders['val'])}")
+    print(f"Test loader batches: {len(final_loaders['test'])}")
 
     # Record dataset info in ClearML
     task.connect_configuration(
-        {"dataset_id": dataset.id},
+        {"dataset_id": clearml_dataset.id},
         name="Dataset Metadata"
    )
 
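Since the __main__ block above only prints batch counts, a quick extra check is whether the loaders actually carry the intended 70/15/15 proportions. A minimal sketch, assuming it runs inside data_preparation.py where prototyping_dataset and SEED already exist (loader.dataset exposes the underlying split, so its length gives the sample count):

    # build the loaders once and compare split sizes against the 70/15/15 target
    loaders = make_dataset_loaders(prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3)
    total = sum(len(loaders[name].dataset) for name in ("train", "val", "test"))
    for name in ("train", "val", "test"):
        n = len(loaders[name].dataset)
        print(f"{name}: {n} samples ({n / total:.0%} of the split total)")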
dataPrep/helpers/create_dataset.py ADDED
@@ -0,0 +1,56 @@
+"""
+A collection of dataset (DS) loading and subsetting functions.
+"""
+
+import random
+import numpy as np
+from datasets import load_dataset
+from clearml import Dataset
+
+
+# Load a DS from a HuggingFace link, subset it, and upload both to ClearML
+def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
+    DATASET_LINK = "DScomp380/plant_village"
+    rng = random.Random(seed)  # local RNG so the subset depends only on the passed seed
+
+    # Load dataset
+    try:
+        ds = load_dataset(DATASET_LINK)
+    except Exception as e:
+        raise RuntimeError(f"Error loading the dataset: {e}")
+
+    data_plants = ds['train']
+    data_length = len(data_plants)
+    features = data_plants.features
+
+    # Calculate the number of samples to use
+    subset_size = int(data_length * subset_ratio)
+
+    # Create a subset of random samples (by their indices)
+    indices = list(range(data_length))
+    rng.shuffle(indices)
+    subset_indices = indices[:subset_size]
+
+    prototyping_dataset = data_plants.select(subset_indices)
+
+    # ---------- Register subset in ClearML ----------
+    clearml_dataset = Dataset.create(
+        dataset_name="Plant Village Prototype",
+        dataset_project="smallGroupProject",
+        dataset_tags=["prototype", "subset"]
+    )
+
+    # Save the indices used, for reproducibility
+    subset_path = "subset_indices.npy"
+    np.save(subset_path, subset_indices)
+    clearml_dataset.add_files(subset_path)
+    clearml_dataset.set_metadata({
+        "subset_ratio": subset_ratio,
+        "total_samples": len(prototyping_dataset)
+    })
+
+    clearml_dataset.upload()
+    clearml_dataset.finalize()
+    clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
+
+    return data_plants, prototyping_dataset, features, clearml_dataset
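Because the helper saves subset_indices.npy into the registered ClearML dataset, a later experiment can rebuild the exact same prototype subset. A hedged sketch of that round trip; the dataset id is a placeholder that would come from the "Dataset Metadata" configuration logged by data_preparation.py:

    import os
    import numpy as np
    from clearml import Dataset
    from datasets import load_dataset

    # fetch the registered artifact and load the saved indices
    local_dir = Dataset.get(dataset_id="<clearml-dataset-id>").get_local_copy()
    subset_indices = np.load(os.path.join(local_dir, "subset_indices.npy"))

    # re-select the identical subset from the source dataset
    data_plants = load_dataset("DScomp380/plant_village")['train']
    prototyping_dataset = data_plants.select(subset_indices)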
dataPrep/helpers/transforms_loaders.py ADDED
@@ -0,0 +1,76 @@
+"""
+A collection of data transformation and dataset loading functions.
+"""
+
+from torchvision import transforms
+from torch.utils.data import DataLoader
+
+
+
+# Defines and returns the normalisation and augmentation pipelines.
+def make_transform_pipelines():
+
+    # Standard ImageNet mean and std - used to normalise the tensors
+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    # Pipeline ensures image formatting is consistent (for Val/Test)
+    normalisation = transforms.Compose([
+
+        # Convert PIL Image to a PyTorch Tensor, scaling pixel values from [0, 255] to [0.0, 1.0]
+        transforms.ToTensor(),
+
+        # Standardises pixel values
+        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
+    ])
+
+    # Augmentation pipeline (creates "new" images by perturbing some parameters)
+    augmentation = transforms.Compose([
+
+        # Randomly change some parameters of the pictures to enrich the dataset
+        transforms.RandomRotation(30),
+        transforms.ColorJitter(brightness=0.2, saturation=0.2),
+        transforms.GaussianBlur(3),
+        transforms.ToTensor(),
+        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
+    ])
+
+    return normalisation, augmentation
+
+
+def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
+    """
+    Creates and returns DataLoaders (train, val, test) for a given dataset.
+    Performs a 70/15/15 split when test_size=0.3.
+    """
+
+    # Define transformation pipelines for the dataset
+    normalisation, augmentation = make_transform_pipelines()
+
+    # 70/30 split creates the train set
+    split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
+    train_split = split_1['train']
+    remaining_split = split_1['test']
+
+    # Split the remaining data in half to get the validation and test sets
+    # (each 15% of the full dataset when test_size=0.3)
+    split_2 = remaining_split.train_test_split(test_size=0.5, seed=seed)
+    val_split, test_split = split_2['train'], split_2['test']
+
+    # Put each split through its pipeline
+    train_split.set_transform(augmentation)
+    val_split.set_transform(normalisation)
+    test_split.set_transform(normalisation)
+
+    # Create a dataloader for each split
+    train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
+    val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
+    test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
+
+    dataset_loaders = {
+        "train": train_loader,
+        "val": val_loader,
+        "test": test_loader
+    }
+
+    return dataset_loaders
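One caveat worth flagging: datasets.Dataset.set_transform calls its argument with a batch dict (column name mapped to a list of values), not with a single PIL image, so passing a raw torchvision Compose as done above may not apply the pipeline per image. A hedged sketch of an adapter, assuming the plant_village image column is named "image" (an assumption, not confirmed by this commit):

    # wraps a torchvision pipeline so it matches set_transform's batch-dict contract
    def as_batch_transform(pipeline):
        def apply(batch):
            # apply the pipeline image-by-image; "image" is an assumed column name
            batch["image"] = [pipeline(img.convert("RGB")) for img in batch["image"]]
            return batch
        return apply

    # usage inside make_dataset_loaders:
    #     train_split.set_transform(as_batch_transform(augmentation))
    #     val_split.set_transform(as_batch_transform(normalisation))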