Yusuf committed on
Commit
04cb886
·
1 Parent(s): 6b1327e

CHORE: separate dataset load & transform pipelines

dataPrep/data_preparation.py CHANGED
@@ -6,6 +6,8 @@ import random
 import numpy as np
 import pandas as pd
 from datasets import load_dataset
+from helpers.create_dataset import load_subset_from_dataset
+from helpers.transforms_loaders import make_dataset_loaders
 
 # --- Visualization ---
 import matplotlib.pyplot as plt
@@ -22,6 +24,8 @@ from clearml import Task, Logger, Dataset
 
 # Setting up the SEED to be able to repeat experiments
 SEED = 42
+DATASET_SUBSET_RATIO = 0.25
+
 random.seed(SEED)
 np.random.seed(SEED)
 torch.manual_seed(SEED)
@@ -29,66 +33,23 @@ if torch.cuda.is_available():
     torch.cuda.manual_seed_all(SEED)
 
 
-# Initialising a task on ClearML
-# UPDATE CLEARML
+# ----- ClearML Setup -----
 task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
 
-# Loading dataset from HugginFace and checking it
-try:
-    ds = load_dataset("DScomp380/plant_village")
-except Exception as e:
-    print(f"Error loading the dataset: {e}")
-
-data_plants = ds['train']
-data_length = len(data_plants)
-features = data_plants.features
-
-# --------------------------- Data selection --------------------------------
-# Creating the prototyping dataset
-SUBSET_RATIO = 0.25 # 25% for prototyping
-
 # Log subset config to ClearML
 task.connect_configuration(
-    {"subset_ratio": SUBSET_RATIO},
+    {"subset_ratio": DATASET_SUBSET_RATIO},
     name="Data subsetting"
 )
 
-# Calculate amount of samples we use
-subset_size = int(data_length * SUBSET_RATIO)
-
-# Creating a subset of random data (by their indices)
-indices = list(range(data_length))
-random.shuffle(indices)
-subset_indices = indices[:subset_size]
-prototyping_dataset = data_plants.select(subset_indices)
 
-# Register this subset in ClearML
-dataset = Dataset.create(
-    dataset_name="Plant Village Prototype",
-    dataset_project="smallGroupProject",
-    dataset_tags=["prototype", "subset"]
+# ----- Load a subset from a given dataset & track with ClearML -----
+data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
+    SEED, DATASET_SUBSET_RATIO, clearml_logger
 )
 
-# Save indicies used for reproducibility
-subset_path = "subset_indices.npy"
-np.save(subset_path, subset_indices)
-dataset.add_files(subset_path)
-
-# Add simple metadata
-dataset.set_metadata({
-    "subset_ratio": SUBSET_RATIO,
-    "total_samples": len(prototyping_dataset)
-})
-
-# Upload to ClearML storage
-dataset.upload()
-dataset.finalize()
-
-# Log the dataset ID
-clearml_logger.report_text(f"Created ClearML Dataset: {dataset.id}")
-
 
 # ---- Exploratory data analysis (EDA) ----
 
@@ -149,130 +110,31 @@ clearml_logger.report_image(
 )
 
 
-# --------------- Data Splits ------------
-def get_transform_pipelines():
-    """
-    Defines and returns the normalization and augmentation pipelines.
-    """
-    # Standard ImageNet mean and std
-    # These values are used to normalize the tensors
-    IMAGENET_MEAN = [0.485, 0.456, 0.406]
-    IMAGENET_STD = [0.229, 0.224, 0.225]
-
-    # Defining pipeline to ensure that images are consistently formatted (for Val/Test)
-    normalisation_pipeline = transforms.Compose([
-        # Convert PIL Image to a PyTorch Tensor
-        # This also scales pixel values from [0, 255] to [0.0, 1.0]
-        transforms.ToTensor(),
-
-        # Normalise the Tensor; Standartises pixel values
-        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-    ])
-    # Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
-    augmentation_pipeline = transforms.Compose([
-        # Randomly changing some parameters of pictures to enrich dataset
-        transforms.RandomRotation(degrees=30),
-        transforms.ColorJitter(brightness=0.2, saturation=0.2),
-        transforms.GaussianBlur(kernel_size=3),
-
-        # Convert to Tensor and Normalise
-        transforms.ToTensor(),
-        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-    ])
-    # Return both pipelines
-    return normalisation_pipeline, augmentation_pipeline
-
-
-def get_prototype_loaders(batch_size=32):
-    """
-    Creates and returns DataLoaders for the 25% PROTOTYPE dataset.
-    """
-    # Calling function to define pipelines
-    normalisation_pipeline, augmentation_pipeline = get_transform_pipelines()
-
-    # -- Split the prototype dataset --
-    # This returns a dictionary: {'train': 70%, 'test': 30%}
-    split_1_dict = prototyping_dataset.train_test_split(test_size=0.3, seed=SEED)
-
-    # Assign the 70% part to final train split
-    proto_train_split = split_1_dict['train']
-
-    # Assign the 30% part to a temporary var
-    proto_temp_split = split_1_dict['test']
-
-    # Split 30% into 2 15%
-    # This returns a dictionary: {'train': 50%, 'test': 50%}
-    split_2_dict = proto_temp_split.train_test_split(test_size=0.5, seed=SEED)
-
-    proto_val_split = split_2_dict['train']
-    proto_test_split = split_2_dict['test']
-
-    # -- Putting splits through pipelines --
-    proto_train_split.set_transform(augmentation_pipeline)
-    proto_val_split.set_transform(normalisation_pipeline)
-    proto_test_split.set_transform(normalisation_pipeline)
-
-    # -- Creating the prototype dataloaders --
-    proto_train_loader = DataLoader(dataset = proto_train_split, batch_size = batch_size, shuffle = True )
-    proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
-    proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
-
-    return proto_train_loader, proto_val_loader, proto_test_loader
-
-
-def get_final_loaders(batch_size=32):
-    """
-    Creates and returns DataLoaders for the 100% FINAL dataset.
-    """
-    # Calling function to define pipelines
-    normalisation_pipeline, augmentation_pipeline = get_transform_pipelines()
-
-    # -- Split the FULL dataset --
-    # This returns a dictionary: {'train': 70%, 'test': 30%}
-    split_1_dict = data_plants.train_test_split(test_size=0.3, seed=SEED)
-
-    # Assign the 70% part to final train split
-    train_split = split_1_dict['train']
-
-    # Assign the 30% part to a temporary var
-    temp_split = split_1_dict['test']
-
-    # Split 30% into 2 15%
-    # This returns a dictionary: {'train': 50%, 'test': 50%}
-    split_2_dict = temp_split.train_test_split(test_size=0.5, seed=SEED)
-
-    val_split = split_2_dict['train']
-    test_split = split_2_dict['test']
-
-    # -- Putting splits through pipelines --
-    train_split.set_transform(augmentation_pipeline)
-    val_split.set_transform(normalisation_pipeline)
-    test_split.set_transform(normalisation_pipeline)
-
-    # -- Creating the final dataloaders --
-    train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
-    val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
-    test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
-    return train_loader, val_loader, test_loader
-
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
 
-    train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
+    # ------------------- Dataset splits ----------------------------------
+    prototype_loaders = make_dataset_loaders(
+        prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
+    )
+
     print("\n--- Handoff Test Successful ---")
-    print(f"Train loader batches: {len(train_loader)}")
-    print(f"Validation loader batches: {len(val_loader)}")
-    print(f"Test loader batches: {len(test_loader)}")
+    print(f"Prototype Train loader batches: {len(prototype_loaders['train'])}")
+    print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
+    print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
+
+    final_loaders = make_dataset_loaders(
+        data_plants, seed=SEED, batch_size=32, test_size=0.3
+    )
 
-    train_loader_fin, val_loader_fin, test_loader_fin = get_final_loaders(batch_size=32)
     print("\n--- Handoff Test Successful ---")
-    print(f"Train loader batches: {len(train_loader_fin)}")
-    print(f"Validation loader batches: {len(val_loader_fin)}")
-    print(f"Test loader batches: {len(test_loader_fin)}")
+    print(f"Train loader batches: {len(final_loaders['train'])}")
+    print(f"Validation loader batches: {len(final_loaders['val'])}")
+    print(f"Test loader batches: {len(final_loaders['test'])}")
 
     # Record dataset info in ClearML
     task.connect_configuration(
-        {"dataset_id": dataset.id},
+        {"dataset_id": clearml_dataset.id},
         name="Dataset Metadata"
    )
 
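Since the __main__ block above only prints batch counts, a quick extra check is whether the loaders actually carry the intended 70/15/15 proportions. A minimal sketch, assuming it runs inside data_preparation.py where prototyping_dataset and SEED already exist (loader.dataset exposes the underlying split, so its length gives the sample count):

    # build the loaders once and compare split sizes against the 70/15/15 target
    loaders = make_dataset_loaders(prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3)
    total = sum(len(loaders[name].dataset) for name in ("train", "val", "test"))
    for name in ("train", "val", "test"):
        n = len(loaders[name].dataset)
        print(f"{name}: {n} samples ({n / total:.0%} of the split total)")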
dataPrep/helpers/create_dataset.py ADDED
@@ -0,0 +1,56 @@
+"""
+A collection of dataset (DS) loading and subsetting functions.
+"""
+
+import random
+import numpy as np
+from datasets import load_dataset
+from clearml import Dataset
+
+
+# Load a DS from a HuggingFace link, subset it, and upload both to ClearML
+def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
+    DATASET_LINK = "DScomp380/plant_village"
+    rng = random.Random(seed)  # local RNG so the subset depends only on the passed seed
+
+    # Load dataset
+    try:
+        ds = load_dataset(DATASET_LINK)
+    except Exception as e:
+        raise RuntimeError(f"Error loading the dataset: {e}")
+
+    data_plants = ds['train']
+    data_length = len(data_plants)
+    features = data_plants.features
+
+    # Calculate the number of samples to use
+    subset_size = int(data_length * subset_ratio)
+
+    # Create a subset of random samples (by their indices)
+    indices = list(range(data_length))
+    rng.shuffle(indices)
+    subset_indices = indices[:subset_size]
+
+    prototyping_dataset = data_plants.select(subset_indices)
+
+    # ---------- Register subset in ClearML ----------
+    clearml_dataset = Dataset.create(
+        dataset_name="Plant Village Prototype",
+        dataset_project="smallGroupProject",
+        dataset_tags=["prototype", "subset"]
+    )
+
+    # Save the indices used, for reproducibility
+    subset_path = "subset_indices.npy"
+    np.save(subset_path, subset_indices)
+    clearml_dataset.add_files(subset_path)
+    clearml_dataset.set_metadata({
+        "subset_ratio": subset_ratio,
+        "total_samples": len(prototyping_dataset)
+    })
+
+    clearml_dataset.upload()
+    clearml_dataset.finalize()
+    clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
+
+    return data_plants, prototyping_dataset, features, clearml_dataset
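Because the helper saves subset_indices.npy into the registered ClearML dataset, a later experiment can rebuild the exact same prototype subset. A hedged sketch of that round trip; the dataset id is a placeholder that would come from the "Dataset Metadata" configuration logged by data_preparation.py:

    import os
    import numpy as np
    from clearml import Dataset
    from datasets import load_dataset

    # fetch the registered artifact and load the saved indices
    local_dir = Dataset.get(dataset_id="<clearml-dataset-id>").get_local_copy()
    subset_indices = np.load(os.path.join(local_dir, "subset_indices.npy"))

    # re-select the identical subset from the source dataset
    data_plants = load_dataset("DScomp380/plant_village")['train']
    prototyping_dataset = data_plants.select(subset_indices)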
dataPrep/helpers/transforms_loaders.py ADDED
@@ -0,0 +1,76 @@
+"""
+A collection of data transformation and dataset loading functions.
+"""
+
+from torchvision import transforms
+from torch.utils.data import DataLoader
+
+
+
+# Defines and returns the normalisation and augmentation pipelines.
+def make_transform_pipelines():
+
+    # Standard ImageNet mean and std - used to normalise the tensors
+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    # Pipeline ensures image formatting is consistent (for Val/Test)
+    normalisation = transforms.Compose([
+
+        # Convert PIL Image to a PyTorch Tensor, scaling pixel values from [0, 255] to [0.0, 1.0]
+        transforms.ToTensor(),
+
+        # Standardises pixel values
+        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
+    ])
+
+    # Augmentation pipeline (creates "new" images by perturbing some parameters)
+    augmentation = transforms.Compose([
+
+        # Randomly change some parameters of the pictures to enrich the dataset
+        transforms.RandomRotation(30),
+        transforms.ColorJitter(brightness=0.2, saturation=0.2),
+        transforms.GaussianBlur(3),
+        transforms.ToTensor(),
+        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
+    ])
+
+    return normalisation, augmentation
+
+
+def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
+    """
+    Creates and returns DataLoaders (train, val, test) for a given dataset.
+    Performs a 70/15/15 split when test_size=0.3.
+    """
+
+    # Define transformation pipelines for the dataset
+    normalisation, augmentation = make_transform_pipelines()
+
+    # 70/30 split creates the train set
+    split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
+    train_split = split_1['train']
+    remaining_split = split_1['test']
+
+    # Split the remaining data in half to get the validation and test sets
+    # (each 15% of the full dataset when test_size=0.3)
+    split_2 = remaining_split.train_test_split(test_size=0.5, seed=seed)
+    val_split, test_split = split_2['train'], split_2['test']
+
+    # Put each split through its pipeline
+    train_split.set_transform(augmentation)
+    val_split.set_transform(normalisation)
+    test_split.set_transform(normalisation)
+
+    # Create a dataloader for each split
+    train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
+    val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
+    test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
+
+    dataset_loaders = {
+        "train": train_loader,
+        "val": val_loader,
+        "test": test_loader
+    }
+
+    return dataset_loaders
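One caveat worth flagging: datasets.Dataset.set_transform calls its argument with a batch dict (column name mapped to a list of values), not with a single PIL image, so passing a raw torchvision Compose as done above may not apply the pipeline per image. A hedged sketch of an adapter, assuming the plant_village image column is named "image" (an assumption, not confirmed by this commit):

    # wraps a torchvision pipeline so it matches set_transform's batch-dict contract
    def as_batch_transform(pipeline):
        def apply(batch):
            # apply the pipeline image-by-image; "image" is an assumed column name
            batch["image"] = [pipeline(img.convert("RGB")) for img in batch["image"]]
            return batch
        return apply

    # usage inside make_dataset_loaders:
    #     train_split.set_transform(as_batch_transform(augmentation))
    #     val_split.set_transform(as_batch_transform(normalisation))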