Spaces:

k23064919
/

smallGroupProject

Sleeping

App Files Files Community

ra1425 committed on Nov 6, 2025

Commit

afc3315

1 Parent(s): c0dc8ab

FEAT: Implemented the complete prototype data pipeline

Browse files

Files changed (1) hide show

data_preparation.py +97 -11

data_preparation.py CHANGED Viewed

@@ -1,16 +1,25 @@
 import os
 import random
 import numpy as np
 import pandas as pd
-# Visualisation
-#import seaborn as sns
 import matplotlib.pyplot as plt
 import torch
 from clearml import Task, Logger
-from datasets import load_dataset
 SEED = 42
 random.seed(SEED)
 np.random.seed(SEED)
@@ -24,8 +33,10 @@ task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
-# Loading dataset from HugginFace
 try:
     ds = load_dataset("DScomp380/plant_village")
 except Exception as e:
@@ -33,14 +44,18 @@ except Exception as e:
 data_plants = ds['train']
 # Verification
 print(f"\nLoaded object type: {type(data_plants)}")
 data_length = len(data_plants)
 print(f"\nLoaded object size: {data_length}")
 features = data_plants.features
 print(f"\nDataset features: {features}")
 # Verifying label count
 if 'label' in features and hasattr(features['label'], 'num_classes'):
@@ -48,13 +63,19 @@ if 'label' in features and hasattr(features['label'], 'num_classes'):
     print(f"Number of disease categories (labels): {label_count}")
 else:
     print("Couldnt determine the labels automatically")
 # Verifying single sample
 sample = data_plants[0]
 print(f"Sample image type: {type(sample['image'])}")
 print(f"Sample label: {sample['label']}")
-# -----------------------------------------------------------
 # Creating the prototyping dataset
 SUBSET_RATIO = 0.25 # 25% for prototyping
@@ -73,13 +94,11 @@ subset_indices = indices[:subset_size]
 prototyping_dataset = data_plants.select(subset_indices)
 #Verifying
 print(f"Prototyping dataset size: {len(prototyping_dataset)}")
-# -----------------------------------------------------------
-# Exploratory data analysis (EDA)
-#sns.set(color_codes = True)
 # Reformatting the label feature to understand bias
 labels_list = prototyping_dataset['label']
@@ -112,11 +131,12 @@ clearml_logger.report_scalar(
     iteration=1
 )
-print("Class imbalance analysis: ")
 print(f"Max labels in a class: {max_count}")
 print(f"Min labels in a class: {min_count}")
 print(f"Mean labels in a class: {mean_count}")
 print(f"Imbalance ratio: {max_count/min_count:.2f}")
 # Mapping indices to class names
 class_names = features['label'].names
@@ -139,4 +159,70 @@ clearml_logger.report_image(
     local_path=plot_file      # The path to the file you just saved
 )
-plt.show()

+# --- Standard Python Library ---
 import os
 import random
+# --- Data Handling & Analysis ---
 import numpy as np
 import pandas as pd
+from datasets import load_dataset
+# --- Visualization ---
 import matplotlib.pyplot as plt
+# import seaborn as sns
+# --- PyTorch (Machine Learning) ---
 import torch
+from torchvision import transforms
+from torch.utils.data import DataLoader
+# --- Experiment Tracking ---
 from clearml import Task, Logger
+# Setting up the SEED to be able to repeat experiments
 SEED = 42
 random.seed(SEED)
 np.random.seed(SEED)
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
+print("✅ Checkpoint: Imports, SEED and ClearML are set")
+# Loading dataset from Hugging Face and checking it
 try:
     ds = load_dataset("DScomp380/plant_village")
 except Exception as e:
 data_plants = ds['train']
+print("--- Verification ---")
 # Verification
 print(f"\nLoaded object type: {type(data_plants)}")
+print("\n --- \n")
 data_length = len(data_plants)
 print(f"\nLoaded object size: {data_length}")
+print("\n --- \n")
 features = data_plants.features
 print(f"\nDataset features: {features}")
+print("\n --- \n")
 # Verifying label count
 if 'label' in features and hasattr(features['label'], 'num_classes'):
     print(f"Number of disease categories (labels): {label_count}")
 else:
     print("Couldnt determine the labels automatically")
+print("\n --- \n")
 # Verifying single sample
 sample = data_plants[0]
 print(f"Sample image type: {type(sample['image'])}")
 print(f"Sample label: {sample['label']}")
+print("\n --- \n")
+print("✅ Checkpoint: Dataset is loaded and data is checked")
+# --------------------------- Data selection --------------------------------
 # Creating the prototyping dataset
 SUBSET_RATIO = 0.25 # 25% for prototyping
 prototyping_dataset = data_plants.select(subset_indices)
+print("✅ Checkpoint: Prototyping dataset is created")
 #Verifying
 print(f"Prototyping dataset size: {len(prototyping_dataset)}")
+# ---- Exploratory data analysis (EDA) ----
 # Reformatting the label feature to understand bias
 labels_list = prototyping_dataset['label']
     iteration=1
 )
+print("--- Class imbalance analysis --- ")
 print(f"Max labels in a class: {max_count}")
 print(f"Min labels in a class: {min_count}")
 print(f"Mean labels in a class: {mean_count}")
 print(f"Imbalance ratio: {max_count/min_count:.2f}")
+print("✅ Checkpoint: Class distribution is calculated")
 # Mapping indices to class names
 class_names = features['label'].names
     local_path=plot_file      # The path to the file you just saved
 )
+# Uncomment plt.show() below to view the plot; note that it pauses the script until the plot window is closed
+#plt.show()
+print("✅ Checkpoint: Plot with classes distributions is created and saved")
+# --------------- Data Splits ------------
+# Standard ImageNet mean and std
+# These values are used to normalize the tensors
+IMAGENET_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_STD = [0.229, 0.224, 0.225]
+# Deterministic preprocessing pipeline (no augmentation) so that validation/test images are formatted consistently
+normalisation_pipeline = transforms.Compose([
+    # Convert PIL Image to a PyTorch Tensor
+    # This also scales pixel values from [0, 255] to [0.0, 1.0]
+    transforms.ToTensor(),
+    # Normalise the tensor: standardises each channel using the ImageNet mean and std
+    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+])
+print("✅ Checkpoint: Transform pipeline created")
+# Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
+augmentation_pipeline = transforms.Compose([
+    # Randomly changing some parameters of pictures to enrich dataset
+    transforms.RandomRotation(degrees=30),
+    transforms.ColorJitter(brightness=0.2, saturation=0.2),
+    transforms.GaussianBlur(kernel_size=3),
+    # Convert to Tensor and Normalise
+    transforms.ToTensor(),
+    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+])
+print("✅ Checkpoint: Augmentation pipeline created")
+# -- Split the prototype dataset --
+# This returns a DatasetDict with two keys: 'train' (70%) and 'test' (30%)
+split_1_dict = prototyping_dataset.train_test_split(test_size=0.3, seed=SEED)
+# Assign the 70% part to final train split
+proto_train_split = split_1_dict['train']
+# Assign the 30% part to a temporary var
+proto_temp_split = split_1_dict['test']
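+# Split the 30% hold-out into two halves, each 15% of the prototype dataset (validation and test)
+# This returns a DatasetDict with two keys: 'train' (50% of the hold-out) and 'test' (50% of the hold-out)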
+split_2_dict = proto_temp_split.train_test_split(test_size=0.5, seed=SEED)
+proto_val_split = split_2_dict['train']
+proto_test_split = split_2_dict['test']
+print("✅ Checkpoint: Dataset splitted")
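+# -- Putting splits through pipelines --
+# NOTE(review): datasets' set_transform calls its argument with a batch dict
+# (e.g. {'image': [...], 'label': [...]}), not with a single PIL image, so passing a
+# torchvision Compose directly will likely fail when a sample is accessed. A wrapper that
+# applies the pipeline to each item of batch['image'] and returns the dict is probably needed — confirm.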
+proto_train_split.set_transform(augmentation_pipeline)
+proto_val_split.set_transform(normalisation_pipeline)
+proto_test_split.set_transform(normalisation_pipeline)
+# -- Creating the prototype dataloaders --
+BATCH_SIZE = 32
+proto_train_loader = DataLoader(dataset = proto_train_split, batch_size = BATCH_SIZE, shuffle = True )
+proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = BATCH_SIZE, shuffle = False )
+proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = BATCH_SIZE, shuffle = False )
+print("✅ Checkpoint: DataLoaders are set")