Yusuf committed on
Commit
6b1327e
·
1 Parent(s): 3562c3d

FEAT: log dataset to clearml

Browse files
Files changed (1) hide show
  1. dataPrep/data_preparation.py +51 -4
dataPrep/data_preparation.py CHANGED
@@ -17,7 +17,7 @@ from torchvision import transforms
17
  from torch.utils.data import DataLoader
18
 
19
  # --- Experiment Tracking ---
20
- from clearml import Task, Logger
21
 
22
 
23
  # Setting up the SEED to be able to repeat experiments
@@ -31,7 +31,7 @@ if torch.cuda.is_available():
31
 
32
  # Initialising a task on ClearML
33
  # UPDATE CLEARML
34
- task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
35
  task.set_random_seed(SEED)
36
  clearml_logger = task.get_logger()
37
 
@@ -44,11 +44,12 @@ except Exception as e:
44
  data_plants = ds['train']
45
  data_length = len(data_plants)
46
  features = data_plants.features
 
47
  # --------------------------- Data selection --------------------------------
48
  # Creating the prototyping dataset
49
  SUBSET_RATIO = 0.25 # 25% for prototyping
50
 
51
- # Loggint it to ClearML
52
  task.connect_configuration(
53
  {"subset_ratio": SUBSET_RATIO},
54
  name="Data subsetting"
@@ -61,9 +62,33 @@ subset_size = int(data_length * SUBSET_RATIO)
61
  indices = list(range(data_length))
62
  random.shuffle(indices)
63
  subset_indices = indices[:subset_size]
64
-
65
  prototyping_dataset = data_plants.select(subset_indices)
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  # ---- Exploratory data analysis (EDA) ----
69
 
@@ -108,6 +133,22 @@ class_names = features['label'].names
108
  formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
109
  label_count.index = formatted_class_names
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # --------------- Data Splits ------------
112
  def get_transform_pipelines():
113
  """
@@ -228,6 +269,12 @@ if __name__ == "__main__":
228
  print(f"Train loader batches: {len(train_loader_fin)}")
229
  print(f"Validation loader batches: {len(val_loader_fin)}")
230
  print(f"Test loader batches: {len(test_loader_fin)}")
 
 
 
 
 
 
231
 
232
  # Close the ClearML task
233
  task.close()
 
17
  from torch.utils.data import DataLoader
18
 
19
  # --- Experiment Tracking ---
20
+ from clearml import Task, Logger, Dataset
21
 
22
 
23
  # Setting up the SEED to be able to repeat experiments
 
31
 
32
  # Initialising a task on ClearML
33
  # UPDATE CLEARML
34
+ task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
35
  task.set_random_seed(SEED)
36
  clearml_logger = task.get_logger()
37
 
 
44
  data_plants = ds['train']
45
  data_length = len(data_plants)
46
  features = data_plants.features
47
+
48
  # --------------------------- Data selection --------------------------------
49
  # Creating the prototyping dataset
50
  SUBSET_RATIO = 0.25 # 25% for prototyping
51
 
52
+ # Log subset config to ClearML
53
  task.connect_configuration(
54
  {"subset_ratio": SUBSET_RATIO},
55
  name="Data subsetting"
 
62
  indices = list(range(data_length))
63
  random.shuffle(indices)
64
  subset_indices = indices[:subset_size]
 
65
  prototyping_dataset = data_plants.select(subset_indices)
66
 
67
+ # Register this subset in ClearML
68
+ dataset = Dataset.create(
69
+ dataset_name="Plant Village Prototype",
70
+ dataset_project="smallGroupProject",
71
+ dataset_tags=["prototype", "subset"]
72
+ )
73
+
74
+ # Save indices used for reproducibility
75
+ subset_path = "subset_indices.npy"
76
+ np.save(subset_path, subset_indices)
77
+ dataset.add_files(subset_path)
78
+
79
+ # Add simple metadata
80
+ dataset.set_metadata({
81
+ "subset_ratio": SUBSET_RATIO,
82
+ "total_samples": len(prototyping_dataset)
83
+ })
84
+
85
+ # Upload to ClearML storage
86
+ dataset.upload()
87
+ dataset.finalize()
88
+
89
+ # Log the dataset ID
90
+ clearml_logger.report_text(f"Created ClearML Dataset: {dataset.id}")
91
+
92
 
93
  # ---- Exploratory data analysis (EDA) ----
94
 
 
133
  formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
134
  label_count.index = formatted_class_names
135
 
136
+ plt.figure(figsize=(10,6))
137
+ label_count.plot(kind='bar', color='skyblue')
138
+ plt.title("Class Distribution in Prototype Dataset")
139
+ plt.xlabel("Class")
140
+ plt.ylabel("Count")
141
+ plt.tight_layout()
142
+ plt.savefig("class_distribution.png")
143
+
144
+ clearml_logger.report_image(
145
+ title="EDA Class Distribution",
146
+ series="Prototype Subset",
147
+ local_path="class_distribution.png",
148
+ iteration=1
149
+ )
150
+
151
+
152
  # --------------- Data Splits ------------
153
  def get_transform_pipelines():
154
  """
 
269
  print(f"Train loader batches: {len(train_loader_fin)}")
270
  print(f"Validation loader batches: {len(val_loader_fin)}")
271
  print(f"Test loader batches: {len(test_loader_fin)}")
272
+
273
+ # Record dataset info in ClearML
274
+ task.connect_configuration(
275
+ {"dataset_id": dataset.id},
276
+ name="Dataset Metadata"
277
+ )
278
 
279
  # Close the ClearML task
280
  task.close()