Yusuf committed on
Commit
2ace27a
·
1 Parent(s): 04cb886

CHORE: separate aug pipeline & parametrise aug transforms

Browse files
dataPrep/data_preparation.py CHANGED
@@ -6,7 +6,7 @@ import random
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
- from helpers.create_dataset import load_subset_from_dataset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
@@ -15,17 +15,28 @@ import matplotlib.pyplot as plt
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
18
- from torchvision import transforms
19
- from torch.utils.data import DataLoader
20
 
21
  # --- Experiment Tracking ---
22
- from clearml import Task, Logger, Dataset
23
 
24
 
25
- # Setting up the SEED to be able to repeat experiments
 
26
  SEED = 42
 
27
  DATASET_SUBSET_RATIO = 0.25
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  random.seed(SEED)
30
  np.random.seed(SEED)
31
  torch.manual_seed(SEED)
@@ -34,10 +45,15 @@ if torch.cuda.is_available():
34
 
35
 
36
  # ----- ClearML Setup -----
37
- task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
38
  task.set_random_seed(SEED)
39
  clearml_logger = task.get_logger()
40
 
 
 
 
 
 
41
  # Log subset config to ClearML
42
  task.connect_configuration(
43
  {"subset_ratio": DATASET_SUBSET_RATIO},
@@ -45,18 +61,12 @@ task.connect_configuration(
45
  )
46
 
47
 
48
- # ----- Load a subset from a given dataset & track with ClearML -----
49
- data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
50
- SEED, DATASET_SUBSET_RATIO, clearml_logger
51
- )
52
-
53
-
54
  # ---- Exploratory data analysis (EDA) ----
55
 
56
  # Reformatting the label feature to understand bias
57
  labels_list = prototyping_dataset['label']
58
  df_labels = pd.Series(labels_list)
59
- label_count = df_labels.value_counts(sort = False)
60
 
61
  # Checking the amount of samples in each class and logging it to clearML
62
 
@@ -114,8 +124,15 @@ clearml_logger.report_image(
114
  if __name__ == "__main__":
115
 
116
  # ------------------- Dataset splits ----------------------------------
 
 
 
 
 
 
 
117
  prototype_loaders = make_dataset_loaders(
118
- prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
119
  )
120
 
121
  print("\n--- Handoff Test Successful ---")
@@ -124,7 +141,7 @@ if __name__ == "__main__":
124
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
125
 
126
  final_loaders = make_dataset_loaders(
127
- data_plants, seed=SEED, batch_size=32, test_size=0.3
128
  )
129
 
130
  print("\n--- Handoff Test Successful ---")
 
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
+ from helpers.create_dataset import make_subset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
 
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
 
 
18
 
19
  # --- Experiment Tracking ---
20
+ from clearml import Task
21
 
22
 
23
+ # -------- Controllable parameters --------
24
+ # Dataset parameters
25
  SEED = 42
26
+ DATASET_LINK = "DScomp380/plant_village"
27
  DATASET_SUBSET_RATIO = 0.25
28
 
29
+ # Augmentation parameters
30
+ ROTATION = 30
31
+ BRIGHTNESS = 0.2
32
+ SATURATION = 0.2
33
+ BLUR = 3
34
+
35
+ # DataLoader parameters
36
+ BATCH_SIZE = 32
37
+ TEST_SIZE = 0.3
38
+
39
+ # Setting up the SEED to be able to repeat experiments
40
  random.seed(SEED)
41
  np.random.seed(SEED)
42
  torch.manual_seed(SEED)
 
45
 
46
 
47
  # ----- ClearML Setup -----
48
+ task = Task.init(project_name='Small Group Project', task_name='data_prep')
49
  task.set_random_seed(SEED)
50
  clearml_logger = task.get_logger()
51
 
52
+ # ----- Load a subset from a given dataset & track with ClearML -----
53
+ data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
54
+ DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
55
+ )
56
+
57
  # Log subset config to ClearML
58
  task.connect_configuration(
59
  {"subset_ratio": DATASET_SUBSET_RATIO},
 
61
  )
62
 
63
 
 
 
 
 
 
 
64
  # ---- Exploratory data analysis (EDA) ----
65
 
66
  # Reformatting the label feature to understand bias
67
  labels_list = prototyping_dataset['label']
68
  df_labels = pd.Series(labels_list)
69
+ label_count = df_labels.value_counts(sort=False)
70
 
71
  # Checking the amount of samples in each class and logging it to clearML
72
 
 
124
  if __name__ == "__main__":
125
 
126
  # ------------------- Dataset splits ----------------------------------
127
+ aug_config = {
128
+ 'rotation': ROTATION,
129
+ 'brightness': BRIGHTNESS,
130
+ 'saturation': SATURATION,
131
+ 'blur': BLUR
132
+ }
133
+
134
  prototype_loaders = make_dataset_loaders(
135
+ prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
136
  )
137
 
138
  print("\n--- Handoff Test Successful ---")
 
141
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
142
 
143
  final_loaders = make_dataset_loaders(
144
+ data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config
145
  )
146
 
147
  print("\n--- Handoff Test Successful ---")
dataPrep/helpers/create_dataset.py CHANGED
@@ -2,19 +2,19 @@
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
 
5
  import random
6
  import numpy as np
7
  from datasets import load_dataset
8
  from clearml import Dataset
9
 
10
 
11
- # Load a DS from HuggingFace Link and subset - upload both to ClearML
12
- def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
13
- DATASET_LINK = "DScomp380/plant_village"
14
 
15
  # Load dataset
16
  try:
17
- ds = load_dataset(DATASET_LINK)
18
  except Exception as e:
19
  raise RuntimeError(f"Error loading the dataset: {e}")
20
 
@@ -35,8 +35,9 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
35
  # ---------- Register subset in ClearML ----------
36
  clearml_dataset = Dataset.create(
37
  dataset_name="Plant Village Prototype",
38
- dataset_project="smallGroupProject",
39
- dataset_tags=["prototype", "subset"]
 
40
  )
41
 
42
  # Save indices
@@ -44,6 +45,7 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
44
  np.save(subset_path, subset_indices)
45
  clearml_dataset.add_files(subset_path)
46
  clearml_dataset.set_metadata({
 
47
  "subset_ratio": subset_ratio,
48
  "total_samples": len(prototyping_dataset)
49
  })
@@ -52,4 +54,7 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
52
  clearml_dataset.finalize()
53
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
54
 
 
 
 
55
  return data_plants, prototyping_dataset, features, clearml_dataset
 
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
5
+ import os
6
  import random
7
  import numpy as np
8
  from datasets import load_dataset
9
  from clearml import Dataset
10
 
11
 
12
+ # Load a DS from HuggingFace Link and subset the DS - upload both to ClearML
13
+ def make_subset(dataset_link, subset_ratio, clearml_logger):
 
14
 
15
  # Load dataset
16
  try:
17
+ ds = load_dataset(dataset_link)
18
  except Exception as e:
19
  raise RuntimeError(f"Error loading the dataset: {e}")
20
 
 
35
  # ---------- Register subset in ClearML ----------
36
  clearml_dataset = Dataset.create(
37
  dataset_name="Plant Village Prototype",
38
+ dataset_project="Small Group Project",
39
+ dataset_tags=["prototype", "subset"],
40
+ use_current_task=True
41
  )
42
 
43
  # Save indices
 
45
  np.save(subset_path, subset_indices)
46
  clearml_dataset.add_files(subset_path)
47
  clearml_dataset.set_metadata({
48
+ "huggingface_dataset": dataset_link,
49
  "subset_ratio": subset_ratio,
50
  "total_samples": len(prototyping_dataset)
51
  })
 
54
  clearml_dataset.finalize()
55
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
56
 
57
+ # Clean up local file
58
+ os.remove(subset_path)
59
+
60
  return data_plants, prototyping_dataset, features, clearml_dataset
dataPrep/helpers/transforms_loaders.py CHANGED
@@ -6,13 +6,12 @@ from torchvision import transforms
6
  from torch.utils.data import DataLoader
7
 
8
 
 
 
 
9
 
10
- # Defines and returns the normalization and augmentation pipelines.
11
- def make_transform_pipelines():
12
-
13
- # Standard ImageNet mean and std - Used to normalize the tensors
14
- IMAGENET_MEAN = [0.485, 0.456, 0.406]
15
- IMAGENET_STD = [0.229, 0.224, 0.225]
16
 
17
  # Pipeline ensures image format is consistent (for Val/Test)
18
  normalisation = transforms.Compose([
@@ -24,28 +23,39 @@ def make_transform_pipelines():
24
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
25
  ])
26
 
 
 
 
 
 
 
 
 
 
 
27
  # Augmentation pipeline (to create "new" images by changing some parameters)
28
  augmentation = transforms.Compose([
29
 
30
  # Randomly changing some parameters of pictures to enrich dataset
31
- transforms.RandomRotation(30),
32
- transforms.ColorJitter(brightness=0.2, saturation=0.2),
33
- transforms.GaussianBlur(3),
34
  transforms.ToTensor(),
35
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
36
  ])
37
 
38
- return normalisation, augmentation
39
 
40
 
41
  """
42
  Creates and returns DataLoaders (train, val, test) for a given dataset.
43
  Performs a 70/15/15 split
44
  """
45
- def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
46
 
47
  # Define transformation pipelines for the dataset
48
- normalisation, augmentation = make_transform_pipelines()
 
49
 
50
  # 70/30 split creates train set
51
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
@@ -53,7 +63,7 @@ def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
53
  remaining_split = split_1['test']
54
 
55
  # 15/15 split on remaining data - validation and test sets
56
- val_split = test_size/2
57
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
58
  val_split, test_split = split_2['train'], split_2['test']
59
 
 
6
  from torch.utils.data import DataLoader
7
 
8
 
9
+ # Standard ImageNet mean and std - Used to normalize the tensors
10
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
11
+ IMAGENET_STD = [0.229, 0.224, 0.225]
12
 
13
+ # Defines and returns the normalization pipeline.
14
+ def make_norm_pipeline():
 
 
 
 
15
 
16
  # Pipeline ensures image format is consistent (for Val/Test)
17
  normalisation = transforms.Compose([
 
23
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
24
  ])
25
 
26
+ return normalisation
27
+
28
# Defines and returns the augmentation (rotation, brightness, saturation, blur) pipeline.
def make_augment_pipeline(aug_config):
    """
    Build the augmentation pipeline from a config mapping.

    Args:
        aug_config: mapping with the keys 'rotation', 'brightness',
            'saturation' and 'blur'. The values are forwarded to
            RandomRotation, ColorJitter (brightness/saturation) and
            GaussianBlur respectively.

    Returns:
        A transforms.Compose that applies the random augmentations, converts
        the image to a tensor and normalises it with IMAGENET_MEAN / IMAGENET_STD.

    Raises:
        KeyError: if any of the four expected keys is missing from aug_config.
    """
    rotation = aug_config['rotation']
    brightness = aug_config['brightness']
    saturation = aug_config['saturation']
    blur = aug_config['blur']

    # Augmentation pipeline (to create "new" images by changing some parameters)
    augmentation = transforms.Compose([

        # Randomly changing some parameters of pictures to enrich dataset
        transforms.RandomRotation(rotation),
        # Keyword arguments are required here: ColorJitter's second positional
        # parameter is `contrast`, so passing saturation positionally would
        # silently jitter contrast instead of saturation.
        transforms.ColorJitter(brightness=brightness, saturation=saturation),
        transforms.GaussianBlur(blur),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
    ])

    return augmentation
48
 
49
 
50
  """
51
  Creates and returns DataLoaders (train, val, test) for a given dataset.
52
  Performs a 70/15/15 split
53
  """
54
+ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
55
 
56
  # Define transformation pipelines for the dataset
57
+ normalisation = make_norm_pipeline()
58
+ augmentation = make_augment_pipeline(aug_config)
59
 
60
  # 70/30 split creates train set
61
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
 
63
  remaining_split = split_1['test']
64
 
65
  # 15/15 split on remaining data - validation and test sets
66
+ val_split = 0.5
67
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
68
  val_split, test_split = split_2['train'], split_2['test']
69