Yusuf Rahman (k22040245) committed on
Commit
0d36ad3
·
unverified ·
2 Parent(s): deb385a 8e6181a

Merge pull request #2 from K23064919/ops/clearml-setup

Browse files
dataPrep/data_preparation.py CHANGED
@@ -6,7 +6,7 @@ import random
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
- from helpers.create_dataset import load_subset_from_dataset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
@@ -15,17 +15,28 @@ import matplotlib.pyplot as plt
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
18
- from torchvision import transforms
19
- from torch.utils.data import DataLoader
20
 
21
  # --- Experiment Tracking ---
22
- from clearml import Task, Logger, Dataset
23
 
24
 
25
- # Setting up the SEED to be able to repeat experiments
 
26
  SEED = 42
 
27
  DATASET_SUBSET_RATIO = 0.25
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  random.seed(SEED)
30
  np.random.seed(SEED)
31
  torch.manual_seed(SEED)
@@ -34,20 +45,37 @@ if torch.cuda.is_available():
34
 
35
 
36
  # ----- ClearML Setup -----
37
- task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 
 
 
 
38
  task.set_random_seed(SEED)
39
  clearml_logger = task.get_logger()
40
 
41
- # Log subset config to ClearML
42
- task.connect_configuration(
43
- {"subset_ratio": DATASET_SUBSET_RATIO},
44
- name="Data subsetting"
45
- )
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # ----- Load a subset from a given dataset & track with ClearML -----
49
- data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
50
- SEED, DATASET_SUBSET_RATIO, clearml_logger
51
  )
52
 
53
 
@@ -56,7 +84,7 @@ data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_d
56
  # Reformatting the label feature to understand bias
57
  labels_list = prototyping_dataset['label']
58
  df_labels = pd.Series(labels_list)
59
- label_count = df_labels.value_counts(sort = False)
60
 
61
  # Checking the amount of samples in each class and logging it to clearML
62
 
@@ -100,12 +128,11 @@ plt.title("Class Distribution in Prototype Dataset")
100
  plt.xlabel("Class")
101
  plt.ylabel("Count")
102
  plt.tight_layout()
103
- plt.savefig("class_distribution.png")
104
 
105
- clearml_logger.report_image(
106
  title="EDA Class Distribution",
107
  series="Prototype Subset",
108
- local_path="class_distribution.png",
109
  iteration=1
110
  )
111
 
@@ -113,9 +140,16 @@ clearml_logger.report_image(
113
  # ----------------------------------------------------------------------
114
  if __name__ == "__main__":
115
 
116
- # ------------------- Dataset splits ----------------------------------
 
 
 
 
 
 
 
117
  prototype_loaders = make_dataset_loaders(
118
- prototyping_dataset, seed=SEED, batch_size=32, test_size=0.3
119
  )
120
 
121
  print("\n--- Handoff Test Successful ---")
@@ -123,8 +157,15 @@ if __name__ == "__main__":
123
  print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
124
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
125
 
 
 
 
 
 
 
 
126
  final_loaders = make_dataset_loaders(
127
- data_plants, seed=SEED, batch_size=32, test_size=0.3
128
  )
129
 
130
  print("\n--- Handoff Test Successful ---")
@@ -137,6 +178,8 @@ if __name__ == "__main__":
137
  {"dataset_id": clearml_dataset.id},
138
  name="Dataset Metadata"
139
  )
 
 
140
 
141
  # Close the ClearML task
142
  task.close()
 
6
  import numpy as np
7
  import pandas as pd
8
  from datasets import load_dataset
9
+ from helpers.create_dataset import make_subset
10
  from helpers.transforms_loaders import make_dataset_loaders
11
 
12
  # --- Visualization ---
 
15
 
16
  # --- PyTorch (Machine Learning) ---
17
  import torch
 
 
18
 
19
  # --- Experiment Tracking ---
20
+ from clearml import Task
21
 
22
 
23
+ # -------- Controllable parameters --------
24
+ # Dataset parameters
25
  SEED = 42
26
+ DATASET_LINK = "DScomp380/plant_village"
27
  DATASET_SUBSET_RATIO = 0.25
28
 
29
+ # Augmentation parameters
30
+ ROTATION = 30
31
+ BRIGHTNESS = 0.2
32
+ SATURATION = 0.2
33
+ BLUR = 3
34
+
35
+ # DataLoader parameters
36
+ BATCH_SIZE = 32
37
+ TEST_SIZE = 0.3
38
+
39
+ # Setting up the SEED to be able to repeat experiments
40
  random.seed(SEED)
41
  np.random.seed(SEED)
42
  torch.manual_seed(SEED)
 
45
 
46
 
47
  # ----- ClearML Setup -----
48
+ task = Task.init(
49
+ project_name='Small Group Project',
50
+ task_name='Data Preparation',
51
+ task_type=Task.TaskTypes.data_processing
52
+ )
53
  task.set_random_seed(SEED)
54
  clearml_logger = task.get_logger()
55
 
 
 
 
 
 
56
 
57
+ # -------- Track full configuration in ClearML --------
58
+ task.connect({
59
+ "seed": SEED,
60
+ "dataset": {
61
+ "link": DATASET_LINK,
62
+ "subset_ratio": DATASET_SUBSET_RATIO,
63
+ },
64
+ "augmentation": {
65
+ "rotation": ROTATION,
66
+ "brightness": BRIGHTNESS,
67
+ "saturation": SATURATION,
68
+ "blur": BLUR
69
+ },
70
+ "dataloaders": {
71
+ "batch_size": BATCH_SIZE,
72
+ "test_size": TEST_SIZE
73
+ }
74
+ })
75
 
76
  # ----- Load a subset from a given dataset & track with ClearML -----
77
+ data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
78
+ DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
79
  )
80
 
81
 
 
84
  # Reformatting the label feature to understand bias
85
  labels_list = prototyping_dataset['label']
86
  df_labels = pd.Series(labels_list)
87
+ label_count = df_labels.value_counts(sort=False)
88
 
89
  # Checking the amount of samples in each class and logging it to clearML
90
 
 
128
  plt.xlabel("Class")
129
  plt.ylabel("Count")
130
  plt.tight_layout()
 
131
 
132
+ clearml_logger.report_matplotlib_figure(
133
  title="EDA Class Distribution",
134
  series="Prototype Subset",
135
+ figure=plt.gcf(),
136
  iteration=1
137
  )
138
 
 
140
  # ----------------------------------------------------------------------
141
  if __name__ == "__main__":
142
 
143
+ # ---------------- Dataset splits ----------------
144
+ aug_config = {
145
+ 'rotation': ROTATION,
146
+ 'brightness': BRIGHTNESS,
147
+ 'saturation': SATURATION,
148
+ 'blur': BLUR
149
+ }
150
+
151
  prototype_loaders = make_dataset_loaders(
152
+ prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
153
  )
154
 
155
  print("\n--- Handoff Test Successful ---")
 
157
  print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
158
  print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
159
 
160
+ clearml_logger.report_text(
161
+ f"Prototype loaders created: "
162
+ f"train={len(prototype_loaders['train'])}, "
163
+ f"val={len(prototype_loaders['val'])}, "
164
+ f"test={len(prototype_loaders['test'])}"
165
+ )
166
+
167
  final_loaders = make_dataset_loaders(
168
+ data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config
169
  )
170
 
171
  print("\n--- Handoff Test Successful ---")
 
178
  {"dataset_id": clearml_dataset.id},
179
  name="Dataset Metadata"
180
  )
181
+ task.mark_completed()
182
+
183
 
184
  # Close the ClearML task
185
  task.close()
dataPrep/helpers/create_dataset.py CHANGED
@@ -2,19 +2,23 @@
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
 
5
  import random
6
  import numpy as np
7
  from datasets import load_dataset
8
  from clearml import Dataset
9
 
10
 
11
- # Load a DS from HuggingFace Link and subset - upload both to ClearML
12
- def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
13
- DATASET_LINK = "DScomp380/plant_village"
 
 
 
14
 
15
  # Load dataset
16
  try:
17
- ds = load_dataset(DATASET_LINK)
18
  except Exception as e:
19
  raise RuntimeError(f"Error loading the dataset: {e}")
20
 
@@ -35,15 +39,21 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
35
  # ---------- Register subset in ClearML ----------
36
  clearml_dataset = Dataset.create(
37
  dataset_name="Plant Village Prototype",
38
- dataset_project="smallGroupProject",
39
- dataset_tags=["prototype", "subset"]
 
40
  )
 
 
 
 
41
 
42
  # Save indices
43
  subset_path = "subset_indices.npy"
44
  np.save(subset_path, subset_indices)
45
  clearml_dataset.add_files(subset_path)
46
  clearml_dataset.set_metadata({
 
47
  "subset_ratio": subset_ratio,
48
  "total_samples": len(prototyping_dataset)
49
  })
@@ -52,4 +62,7 @@ def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
52
  clearml_dataset.finalize()
53
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
54
 
 
 
 
55
  return data_plants, prototyping_dataset, features, clearml_dataset
 
2
  A collection of dataset (DS) loading and subsetting functions.
3
  """
4
 
5
+ import os
6
  import random
7
  import numpy as np
8
  from datasets import load_dataset
9
  from clearml import Dataset
10
 
11
 
12
+ '''
13
+ Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML
14
+ Subset indices are uploaded to ClearML for reproducibility
15
+ REPRODUCE: Load full DS, then load indices from ClearML to get same subset
16
+ '''
17
+ def make_subset(dataset_link, subset_ratio, clearml_logger):
18
 
19
  # Load dataset
20
  try:
21
+ ds = load_dataset(dataset_link)
22
  except Exception as e:
23
  raise RuntimeError(f"Error loading the dataset: {e}")
24
 
 
39
  # ---------- Register subset in ClearML ----------
40
  clearml_dataset = Dataset.create(
41
  dataset_name="Plant Village Prototype",
42
+ dataset_project="Small Group Project",
43
+ dataset_tags=["prototype", "subset"],
44
+ use_current_task=True
45
  )
46
+ clearml_dataset.add_tags([
47
+ f"subset_ratio_{subset_ratio}",
48
+ "hf_source"
49
+ ])
50
 
51
  # Save indices
52
  subset_path = "subset_indices.npy"
53
  np.save(subset_path, subset_indices)
54
  clearml_dataset.add_files(subset_path)
55
  clearml_dataset.set_metadata({
56
+ "huggingface_dataset": dataset_link,
57
  "subset_ratio": subset_ratio,
58
  "total_samples": len(prototyping_dataset)
59
  })
 
62
  clearml_dataset.finalize()
63
  clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
64
 
65
+ # Clean up local file
66
+ os.remove(subset_path)
67
+
68
  return data_plants, prototyping_dataset, features, clearml_dataset
dataPrep/helpers/transforms_loaders.py CHANGED
@@ -6,13 +6,12 @@ from torchvision import transforms
6
  from torch.utils.data import DataLoader
7
 
8
 
 
 
 
9
 
10
- # Defines and returns the normalization and augmentation pipelines.
11
- def make_transform_pipelines():
12
-
13
- # Standard ImageNet mean and std - Used to normalize the tensors
14
- IMAGENET_MEAN = [0.485, 0.456, 0.406]
15
- IMAGENET_STD = [0.229, 0.224, 0.225]
16
 
17
  # Pipeline ensures image format is consistent (for Val/Test)
18
  normalisation = transforms.Compose([
@@ -24,28 +23,39 @@ def make_transform_pipelines():
24
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
25
  ])
26
 
 
 
 
 
 
 
 
 
 
 
27
  # Augmentation pipeline (to create "new" images by changing some parameters)
28
  augmentation = transforms.Compose([
29
 
30
  # Randomly changing some parameters of pictures to enrich dataset
31
- transforms.RandomRotation(30),
32
- transforms.ColorJitter(brightness=0.2, saturation=0.2),
33
- transforms.GaussianBlur(3),
34
  transforms.ToTensor(),
35
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
36
  ])
37
 
38
- return normalisation, augmentation
39
 
40
 
41
  """
42
  Creates and returns DataLoaders (train, val, test) for a given dataset.
43
  Performs a 70/15/15 split
44
  """
45
- def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
46
 
47
  # Define transformation pipelines for the dataset
48
- normalisation, augmentation = make_transform_pipelines()
 
49
 
50
  # 70/30 split creates train set
51
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
@@ -53,7 +63,7 @@ def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
53
  remaining_split = split_1['test']
54
 
55
  # 15/15 split on remaining data - validation and test sets
56
- val_split = test_size/2
57
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
58
  val_split, test_split = split_2['train'], split_2['test']
59
 
 
6
  from torch.utils.data import DataLoader
7
 
8
 
9
+ # Standard ImageNet mean and std - Used to normalize the tensors
10
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
11
+ IMAGENET_STD = [0.229, 0.224, 0.225]
12
 
13
+ # Defines and returns the normalization pipeline.
14
+ def make_norm_pipeline():
 
 
 
 
15
 
16
  # Pipeline ensures image format is consistent (for Val/Test)
17
  normalisation = transforms.Compose([
 
23
  transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
24
  ])
25
 
26
+ return normalisation
27
+
28
# Defines and returns the augmentation (rotation, brightness, saturation, blur) pipeline.
def make_augment_pipeline(aug_config):
    """Build the training-time augmentation transform.

    Parameters:
        aug_config (dict): must contain 'rotation' (max degrees),
            'brightness' and 'saturation' (jitter factors), and
            'blur' (Gaussian kernel size; must resolve to an odd integer).

    Returns:
        torchvision.transforms.Compose: augmentation pipeline ending in
        ToTensor + ImageNet normalization.
    """
    rotation = aug_config['rotation']
    brightness = aug_config['brightness']
    saturation = aug_config['saturation']
    # GaussianBlur requires an *integer* kernel size. Callers that read the
    # config back from ClearML task parameters get strings and cast them to
    # float, so coerce here; int(3) == 3 keeps existing behavior.
    blur = int(aug_config['blur'])

    # Augmentation pipeline (to create "new" images by changing some parameters)
    augmentation = transforms.Compose([

        # Randomly changing some parameters of pictures to enrich dataset
        transforms.RandomRotation(rotation),
        transforms.ColorJitter(brightness=brightness, saturation=saturation),
        transforms.GaussianBlur(blur),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
    ])

    return augmentation
48
 
49
 
50
  """
51
  Creates and returns DataLoaders (train, val, test) for a given dataset.
52
  Performs a 70/15/15 split
53
  """
54
+ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
55
 
56
  # Define transformation pipelines for the dataset
57
+ normalisation = make_norm_pipeline()
58
+ augmentation = make_augment_pipeline(aug_config)
59
 
60
  # 70/30 split creates train set
61
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
 
63
  remaining_split = split_1['test']
64
 
65
  # 15/15 split on remaining data - validation and test sets
66
+ val_split = 0.5
67
  split_2 = remaining_split.train_test_split(test_size=val_split, seed=seed)
68
  val_split, test_split = split_2['train'], split_2['test']
69
 
models/__init__.py ADDED
File without changes
trainingModel/__init__.py ADDED
File without changes
trainingModel/run_training.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Rebuild the data-prep DataLoaders from a logged ClearML task and train.

Recovers every preprocessing parameter (seed, batch size, split ratio,
augmentation config) and the exact subset indices from the data-preparation
task so that training runs on byte-identical data splits.
"""
import os
import numpy as np

from clearml import Task, Dataset
from datasets import load_dataset
from dataPrep.helpers.transforms_loaders import make_dataset_loaders

import torch
from models.modelOne import modelOne
from trainingModel.Training import train_model


# Load data prep task from ClearML
DATA_PREP_TASK_ID = "f6888baedc7142fcad9e0cc6837c5cb5"
DATA_PREP = Task.get_task(task_id=DATA_PREP_TASK_ID)

data_params = DATA_PREP.get_parameters()
dataset_link = data_params['General/dataset/link']

# Load the whole dataset
try:
    ds = load_dataset(dataset_link)
except Exception as e:
    # Chain the cause so the original HuggingFace error stays visible.
    raise RuntimeError(f"Error loading the dataset: {e}") from e

full_dataset = ds['train']


# Load the subset indices from ClearML
# NOTE(review): SUBSET_ID is byte-identical to DATA_PREP_TASK_ID above, but
# Dataset.get expects a ClearML *dataset* id, not a task id — confirm this is
# the id of the Dataset created by make_subset during data prep.
SUBSET_ID = "f6888baedc7142fcad9e0cc6837c5cb5"
subset_clearml = Dataset.get(dataset_id=SUBSET_ID)

local_folder = subset_clearml.get_local_copy()
subset_indices_path = os.path.join(local_folder, "subset_indices.npy")
subset_indices = np.load(subset_indices_path)

print("Loaded subset indices:", subset_indices.shape)


# Apply subset indices to full dataset - this gives you the same subset as data prep
subset_dataset = full_dataset.select(subset_indices)


# Extract parameters from data prep task - these will create the DataLoaders.
# ClearML returns every parameter as a string, so cast each to its real type.
seed = int(data_params['General/seed'])
batch_size = int(data_params['General/dataloaders/batch_size'])
test_size = float(data_params['General/dataloaders/test_size'])

aug_config = {
    'rotation': float(data_params['General/augmentation/rotation']),
    'brightness': float(data_params['General/augmentation/brightness']),
    'saturation': float(data_params['General/augmentation/saturation']),
    # GaussianBlur needs an odd *integer* kernel size; float(…) alone would
    # make torchvision raise when the augmentation pipeline is built.
    'blur': int(float(data_params['General/augmentation/blur']))
}

# Create DataLoaders using the parameters from data prep
subset_loaders = make_dataset_loaders(
    subset_dataset, seed, batch_size, test_size, aug_config
)

print("\n--- Handoff Test Successful ---")
print(f"Prototype Train loader batches: {len(subset_loaders['train'])}")
print(f"Prototype Validation loader batches: {len(subset_loaders['val'])}")
print(f"Prototype Test loader batches: {len(subset_loaders['test'])}")


full_loaders = make_dataset_loaders(
    full_dataset, seed, batch_size, test_size, aug_config
)

print("\n--- Handoff Test Successful ---")
print(f"Train loader batches: {len(full_loaders['train'])}")
print(f"Validation loader batches: {len(full_loaders['val'])}")
print(f"Test loader batches: {len(full_loaders['test'])}")


# -------- Build the ML model --------
model = modelOne(noOfClasses=39)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ------- Train the model (on subset for now) -------
'''
When calling this function, the model should be trained on the given dataset

train_model(
    model=model,
    train_loader=subset_loaders['train'],
    val_loader=subset_loaders['val'],
    device=device,
    n_epochs=10,
    lr=1e-3,
    save_path="best_model.pt",
)
'''