Anna Rachkova (k24040374) committed
Commit 89c8841 · unverified · Parent: 9af0f61

Clean up code and remove redundancies

Files changed (1):
  1. data_preparation.py +11 -81

data_preparation.py CHANGED
@@ -35,9 +35,6 @@ task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
 
-print("✅ Checkpoint: Imports, SEED and ClearML are set")
-
-
 # Loading dataset from HugginFace and checking it
 try:
     ds = load_dataset("DScomp380/plant_village")
@@ -45,38 +42,8 @@ except Exception as e:
     print(f"Error loading the dataset: {e}")
 
 data_plants = ds['train']
-
-print("--- Verification ---")
-# Verification
-print(f"\nLoaded object type: {type(data_plants)}")
-print("\n --- \n")
-
 data_length = len(data_plants)
-print(f"\nLoaded object size: {data_length}")
-print("\n --- \n")
-
 features = data_plants.features
-print(f"\nDataset features: {features}")
-print("\n --- \n")
-
-# Verifying label count
-if 'label' in features and hasattr(features['label'], 'num_classes'):
-    label_count = features['label'].num_classes
-    print(f"Number of disease categories (labels): {label_count}")
-else:
-    print("Couldnt determine the labels automatically")
-print("\n --- \n")
-
-
-# Verifying single sample
-sample = data_plants[0]
-print(f"Sample image type: {type(sample['image'])}")
-print(f"Sample label: {sample['label']}")
-print("\n --- \n")
-
-print("✅ Checkpoint: Dataset is loaded and data is checked")
-
-
 # --------------------------- Data selection --------------------------------
 # Creating the prototyping dataset
 SUBSET_RATIO = 0.25 # 25% for prototyping
@@ -97,10 +64,6 @@ subset_indices = indices[:subset_size]
 
 prototyping_dataset = data_plants.select(subset_indices)
 
-print("✅ Checkpoint: Prototyping dataset is created")
-#Verifying
-print(f"Prototyping dataset size: {len(prototyping_dataset)}")
-
 
 # ---- Exploratory data analysis (EDA) ----
 
@@ -113,7 +76,7 @@ label_count = df_labels.value_counts(sort = False)
 
 min_count = label_count.min()
 clearml_logger.report_scalar(
-    title="Classes Counts",
+    title="Exploratory data analysis (EDA)",
     series="Min Class Count",
     value=min_count,
     iteration=1
@@ -121,7 +84,7 @@ clearml_logger.report_scalar(
 
 max_count = label_count.max()
 clearml_logger.report_scalar(
-    title="Classes Counts",
+    title="Exploratory data analysis (EDA)",
     series="Max Class Count",
     value=max_count,
     iteration=1
@@ -129,46 +92,22 @@ clearml_logger.report_scalar(
 
 mean_count = label_count.mean()
 clearml_logger.report_scalar(
-    title="Classes Counts",
+    title="Exploratory data analysis (EDA)",
     series="Imbalance Ratio (Max/Min)",
     value=(max_count / min_count),
     iteration=1
 )
-
 print("--- Class imbalance analysis --- ")
 print(f"Max labels in a class: {max_count}")
 print(f"Min labels in a class: {min_count}")
 print(f"Mean labels in a class: {mean_count}")
 print(f"Imbalance ratio: {max_count/min_count:.2f}")
-print("✅ Checkpoint: Class distribution is calculated")
 
 # Mapping indeces to class names
 class_names = features['label'].names
 formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
 label_count.index = formatted_class_names
 
-
-# Creating bar chart with labels distribution
-label_count.plot(kind='bar', figsize=(15,6))
-plt.xlabel('Labels')
-plt.ylabel('Sample count')
-plt.title('Class distribution among chosen samples')
-
-plot_file = 'class_distribution.png'
-plt.savefig(plot_file)
-
-clearml_logger.report_image(
-    title="EDA", # The title for the plot section in ClearML
-    series="Class Distribution", # The name of this specific plot
-    iteration=1, # The experiment step
-    local_path=plot_file # The path to the file you just saved
-)
-
-# To see the plot uncomment but itll pause the code
-#plt.show()
-print("✅ Checkpoint: Plot with classes distributions is created and saved")
-
-
 # --------------- Data Splits ------------
 def get_transform_pipelines():
     """
@@ -188,8 +127,6 @@ def get_transform_pipelines():
         # Normalise the Tensor; Standartises pixel values
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
     ])
-    print("✅ Checkpoint: Transform pipeline created")
-
    # Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
     augmentation_pipeline = transforms.Compose([
         # Randomly changing some parameters of pictures to enrich dataset
@@ -200,10 +137,7 @@ def get_transform_pipelines():
         # Convert to Tensor and Normalise
         transforms.ToTensor(),
         transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-    ])
-
-    print("✅ Checkpoint: Augmentation pipeline created")
-
+    ])
     # Return both pipelines
     return normalisation_pipeline, augmentation_pipeline
 
@@ -232,8 +166,6 @@ def get_prototype_loaders(batch_size=32):
     proto_val_split = split_2_dict['train']
     proto_test_split = split_2_dict['test']
 
-    print("✅ Checkpoint: Prototype Dataset splitted")
-
     # -- Putting splits through pipelines --
     proto_train_split.set_transform(augmentation_pipeline)
     proto_val_split.set_transform(normalisation_pipeline)
@@ -244,7 +176,6 @@ def get_prototype_loaders(batch_size=32):
     proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
     proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
 
-    print("✅ Checkpoint: Prototype DataLoaders are set")
     return proto_train_loader, proto_val_loader, proto_test_loader
 
 
@@ -272,8 +203,6 @@ def get_final_loaders(batch_size=32):
     val_split = split_2_dict['train']
     test_split = split_2_dict['test']
 
-    print("✅ Checkpoint: Final Dataset splitted")
-
     # -- Putting splits through pipelines --
     train_split.set_transform(augmentation_pipeline)
     val_split.set_transform(normalisation_pipeline)
@@ -283,22 +212,23 @@ def get_final_loaders(batch_size=32):
     train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
     val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
     test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
-
-    print("✅ Checkpoint: Final DataLoaders are set")
     return train_loader, val_loader, test_loader
 
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
 
-    print("\nRunning data_preparation.py")
-
     train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
-
     print("\n--- Handoff Test Successful ---")
     print(f"Train loader batches: {len(train_loader)}")
     print(f"Validation loader batches: {len(val_loader)}")
     print(f"Test loader batches: {len(test_loader)}")
+
+    train_loader_fin, val_loader_fin, test_loader_fin = get_final_loaders(batch_size=32)
+    print("\n--- Handoff Test Successful ---")
+    print(f"Train loader batches: {len(train_loader_fin)}")
+    print(f"Validation loader batches: {len(val_loader_fin)}")
+    print(f"Test loader batches: {len(test_loader_fin)}")
 
     # Close the ClearML task
     task.close()
-    print("\n--- Script Finished ---")
+    print("\n--- Script Finished ---")
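
After this cleanup, the module's hand-off surface is the pair of loader factories, get_prototype_loaders() and get_final_loaders(). A minimal sketch of downstream use, assuming data_preparation.py is importable from the repo root and that batches collate to a dict keyed by the dataset's 'image' and 'label' columns (an assumption for illustration, not something this diff shows):

from data_preparation import get_prototype_loaders

# Build the 25%-subset prototyping loaders exposed by this module.
train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)

# Peek at one batch to sanity-check shapes before handing off to training.
batch = next(iter(train_loader))    # assumed dict batch: {'image': Tensor, 'label': Tensor}
print(batch['image'].shape)         # expected roughly [32, 3, H, W] after the transforms
print(batch['label'][:5])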