Yusuf committed on
Commit
03021e1
·
1 Parent(s): 0abee12

fix: replace clearml datasets with artifacts

Browse files
dataPrep/data_preparation.py CHANGED
@@ -74,15 +74,15 @@ task.connect({
74
  })
75
 
76
  # ----- Load a subset from a given dataset & track with ClearML -----
77
- data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
78
- DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
79
  )
80
 
81
 
82
  # ---- Exploratory data analysis (EDA) ----
83
 
84
  # Reformatting the label feature to understand bias
85
- labels_list = prototyping_dataset['label']
86
  df_labels = pd.Series(labels_list)
87
  label_count = df_labels.value_counts(sort=False)
88
 
@@ -111,6 +111,7 @@ clearml_logger.report_scalar(
111
  value=(max_count / min_count),
112
  iteration=1
113
  )
 
114
  print("--- Class imbalance analysis --- ")
115
  print(f"Max labels in a class: {max_count}")
116
  print(f"Min labels in a class: {min_count}")
@@ -122,16 +123,17 @@ class_names = features['label'].names
122
  formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
123
  label_count.index = formatted_class_names
124
 
 
125
  plt.figure(figsize=(10,6))
126
  label_count.plot(kind='bar', color='skyblue')
127
- plt.title("Class Distribution in Prototype Dataset")
128
  plt.xlabel("Class")
129
  plt.ylabel("Count")
130
  plt.tight_layout()
131
 
132
  clearml_logger.report_matplotlib_figure(
133
  title="EDA Class Distribution",
134
- series="Prototype Subset",
135
  figure=plt.gcf(),
136
  iteration=1
137
  )
@@ -149,7 +151,7 @@ if __name__ == "__main__":
149
  }
150
 
151
  prototype_loaders = make_dataset_loaders(
152
- prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
153
  )
154
 
155
  print("\n--- Handoff Test Successful ---")
@@ -173,14 +175,9 @@ if __name__ == "__main__":
173
  print(f"Validation loader batches: {len(final_loaders['val'])}")
174
  print(f"Test loader batches: {len(final_loaders['test'])}")
175
 
176
- # Record dataset info in ClearML
177
- task.connect_configuration(
178
- {"dataset_id": clearml_dataset.id},
179
- name="Dataset Metadata"
180
- )
181
- task.mark_completed()
182
 
183
-
184
  # Close the ClearML task
 
185
  task.close()
 
186
  print("\n--- Script Finished ---")
 
74
  })
75
 
76
  # ----- Load a subset from a given dataset & track with ClearML -----
77
+ data_plants, subset_dataset, features = make_subset(
78
+ DATASET_LINK, DATASET_SUBSET_RATIO, task
79
  )
80
 
81
 
82
  # ---- Exploratory data analysis (EDA) ----
83
 
84
  # Reformatting the label feature to understand bias
85
+ labels_list = subset_dataset['label']
86
  df_labels = pd.Series(labels_list)
87
  label_count = df_labels.value_counts(sort=False)
88
 
 
111
  value=(max_count / min_count),
112
  iteration=1
113
  )
114
+
115
  print("--- Class imbalance analysis --- ")
116
  print(f"Max labels in a class: {max_count}")
117
  print(f"Min labels in a class: {min_count}")
 
123
  formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
124
  label_count.index = formatted_class_names
125
 
126
+ # Plotting class distribution
127
  plt.figure(figsize=(10,6))
128
  label_count.plot(kind='bar', color='skyblue')
129
+ plt.title("Class Distribution in Subset Dataset")
130
  plt.xlabel("Class")
131
  plt.ylabel("Count")
132
  plt.tight_layout()
133
 
134
  clearml_logger.report_matplotlib_figure(
135
  title="EDA Class Distribution",
136
+ series="Subset Dataset",
137
  figure=plt.gcf(),
138
  iteration=1
139
  )
 
151
  }
152
 
153
  prototype_loaders = make_dataset_loaders(
154
+ subset_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
155
  )
156
 
157
  print("\n--- Handoff Test Successful ---")
 
175
  print(f"Validation loader batches: {len(final_loaders['val'])}")
176
  print(f"Test loader batches: {len(final_loaders['test'])}")
177
 
 
 
 
 
 
 
178
 
 
179
  # Close the ClearML task
180
+ task.mark_completed()
181
  task.close()
182
+
183
  print("\n--- Script Finished ---")
dataPrep/helpers/clearml_data.py CHANGED
@@ -7,37 +7,62 @@ from dataPrep.helpers.transforms_loaders import make_dataset_loaders
7
 
8
 
9
  '''
10
- Takes latest Data Prep ClearML task from project and extracts data loaders and metadata
 
 
11
  '''
12
  def extract_latest_data_task(project_name: str = "Small Group Project"):
13
 
14
- all_tasks = Task.get_tasks(project_name=project_name)
 
 
 
 
 
 
 
15
  if not all_tasks:
16
  raise RuntimeError(f"No tasks found in project '{project_name}'")
17
 
18
- dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
 
 
 
 
 
19
  if not dp_tasks:
20
  raise RuntimeError("No 'Data Preparation' tasks found in this project!")
21
 
22
  # Latest Data Prep Task
23
- latest_task = max(dp_tasks, key=lambda t: t.id)
24
  DYNAMIC_TASK_ID = latest_task.id
25
  DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
26
 
27
- # Dataset ID
28
- config_objects = DATA_PREP.get_configuration_objects()
29
- raw_meta = config_objects["Dataset Metadata"]
30
- dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
31
-
32
- # Load ClearML Dataset
33
- subset_clearml = Dataset.get(dataset_id=dataset_id)
34
- local_folder = subset_clearml.get_local_copy()
35
 
36
- subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
 
 
37
 
38
- # Load Dataset Parameters
39
  data_params = DATA_PREP.get_parameters()
 
 
40
  dataset_link = data_params['General/dataset/link']
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # Load Full Dataset
43
  try:
@@ -59,11 +84,12 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
59
  # Gather data prep task metadata
60
  data_prep_metadata = {
61
  "data_prep_task_id": DYNAMIC_TASK_ID,
62
- "dataset_id": dataset_id,
63
  "dataset_link": dataset_link,
 
64
  "augmentation_used": aug_config,
65
  "batch_size_used": batch_size,
66
  "seed_used": seed,
 
67
  }
68
 
69
  return subset_loaders, full_loaders, data_prep_metadata
@@ -74,7 +100,7 @@ Takes a given dataset, subset, data params to create DataLoaders
74
  Loaders split data into train, val, test
75
  '''
76
  def get_data_loaders(data_params, subset_dataset, full_dataset):
77
-
78
  # Extract data parameters- these will be used in the DataLoaders
79
  seed = int(data_params['General/seed'])
80
  batch_size = int(data_params['General/dataloaders/batch_size'])
 
7
 
8
 
9
  '''
10
+ Takes latest Data Prep ClearML task from project and reconstructs:
11
+ - data loaders for both full and subset datasets
12
+ - Aug settings used
13
  '''
14
  def extract_latest_data_task(project_name: str = "Small Group Project"):
15
 
16
+ # --------- Get latest Data Preparation task from ClearML ---------
17
+
18
+ all_tasks = Task.get_tasks(
19
+ project_name=project_name,
20
+ allow_archived=False,
21
+ task_filter={'order_by': ["-last_update"]},
22
+ )
23
+
24
  if not all_tasks:
25
  raise RuntimeError(f"No tasks found in project '{project_name}'")
26
 
27
+ dp_tasks = [
28
+ t for t in all_tasks
29
+ if t.task_type == Task.TaskTypes.data_processing
30
+ and t.completed is not None
31
+ ]
32
+
33
  if not dp_tasks:
34
  raise RuntimeError("No 'Data Preparation' tasks found in this project!")
35
 
36
  # Latest Data Prep Task
37
+ latest_task = dp_tasks[0]
38
  DYNAMIC_TASK_ID = latest_task.id
39
  DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
40
 
41
+ # Load subset indices artifact from Data Prep task
42
+ artifacts = DATA_PREP.artifacts
43
+ if "subset_indices" not in artifacts:
44
+ raise RuntimeError("Data Prep task did not upload 'subset_indices' artifact!")
 
 
 
 
45
 
46
+ artifact = artifacts["subset_indices"]
47
+ subset_indices_path = artifact.get_local_copy()
48
+ subset_indices = np.load(subset_indices_path)
49
 
50
+ # Load dataset metadata from Data Prep task
51
  data_params = DATA_PREP.get_parameters()
52
+
53
+ subset_ratio = float(data_params['General/dataset/subset_ratio'])
54
  dataset_link = data_params['General/dataset/link']
55
+ seed = int(data_params['General/seed'])
56
+ batch_size = int(data_params['General/dataloaders/batch_size'])
57
+ test_size = float(data_params['General/dataloaders/test_size'])
58
+
59
+ aug_config = {
60
+ 'rotation': float(data_params['General/augmentation/rotation']),
61
+ 'brightness': float(data_params['General/augmentation/brightness']),
62
+ 'saturation': float(data_params['General/augmentation/saturation']),
63
+ 'blur': float(data_params['General/augmentation/blur']),
64
+ }
65
+
66
 
67
  # Load Full Dataset
68
  try:
 
84
  # Gather data prep task metadata
85
  data_prep_metadata = {
86
  "data_prep_task_id": DYNAMIC_TASK_ID,
 
87
  "dataset_link": dataset_link,
88
+ "subset_ratio_used": subset_ratio,
89
  "augmentation_used": aug_config,
90
  "batch_size_used": batch_size,
91
  "seed_used": seed,
92
+ "test_size_used": test_size
93
  }
94
 
95
  return subset_loaders, full_loaders, data_prep_metadata
 
100
  Loaders split data into train, val, test
101
  '''
102
  def get_data_loaders(data_params, subset_dataset, full_dataset):
103
+
104
  # Extract data parameters- these will be used in the DataLoaders
105
  seed = int(data_params['General/seed'])
106
  batch_size = int(data_params['General/dataloaders/batch_size'])
dataPrep/helpers/create_dataset.py CHANGED
@@ -6,7 +6,6 @@ import os
6
  import random
7
  import numpy as np
8
  from datasets import load_dataset
9
- from clearml import Dataset
10
 
11
 
12
  '''
@@ -14,7 +13,7 @@ Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML
14
  Subset indices are uploaded to ClearML for reproducibility
15
  REPRODUCE: Load full DS, then load indices from ClearML to get same subset
16
  '''
17
- def make_subset(dataset_link, subset_ratio, clearml_logger):
18
 
19
  # Load dataset
20
  try:
@@ -34,36 +33,16 @@ def make_subset(dataset_link, subset_ratio, clearml_logger):
34
  random.shuffle(indices)
35
  subset_indices = indices[:subset_size]
36
 
37
- prototyping_dataset = data_plants.select(subset_indices)
38
- # I THINK WE NEED TO REMOVE THIS LATER
39
- # We dont really need to upload subset everytime (Im not sure tho)
40
- # Register subset in ClearML
41
- clearml_dataset = Dataset.create(
42
- dataset_name="Plant Village Prototype",
43
- dataset_project="Small Group Project",
44
- dataset_tags=["prototype", "subset"],
45
- use_current_task=False
46
- )
47
- clearml_dataset.add_tags([
48
- f"subset_ratio_{subset_ratio}",
49
- "hf_source"
50
- ])
51
 
52
- # Save indices
53
  subset_path = "subset_indices.npy"
54
  np.save(subset_path, subset_indices)
55
- clearml_dataset.add_files(subset_path)
56
- clearml_dataset.set_metadata({
57
- "huggingface_dataset": dataset_link,
58
- "subset_ratio": subset_ratio,
59
- "total_samples": len(prototyping_dataset)
60
- })
61
-
62
- clearml_dataset.upload()
63
- clearml_dataset.finalize()
64
- clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
65
 
66
- # Clean up local file
67
- os.remove(subset_path)
 
 
 
68
 
69
- return data_plants, prototyping_dataset, features, clearml_dataset
 
6
  import random
7
  import numpy as np
8
  from datasets import load_dataset
 
9
 
10
 
11
  '''
 
13
  Subset indices are uploaded to ClearML for reproducibility
14
  REPRODUCE: Load full DS, then load indices from ClearML to get same subset
15
  '''
16
+ def make_subset(dataset_link, subset_ratio, clearml_task):
17
 
18
  # Load dataset
19
  try:
 
33
  random.shuffle(indices)
34
  subset_indices = indices[:subset_size]
35
 
36
+ subset_dataset = data_plants.select(subset_indices)
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # -------- Upload the subset indices as a ClearML artifact --------
39
  subset_path = "subset_indices.npy"
40
  np.save(subset_path, subset_indices)
 
 
 
 
 
 
 
 
 
 
41
 
42
+ clearml_task.upload_artifact(
43
+ name="subset_indices",
44
+ artifact_object=subset_path
45
+ )
46
+ clearml_task.get_logger().report_text(f"Uploaded subset indices as artifact: {subset_path}")
47
 
48
+ return data_plants, subset_dataset, features
trainingModel/run_training.py CHANGED
@@ -26,7 +26,7 @@ training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
26
  # Training parameters - Modify these to experiment
27
  training_config = {
28
  "num_classes": 39,
29
- "n_epochs": 3,
30
  "learning_rate": 1e-3,
31
  "optimizer": "adam",
32
  "save_path": "best_model.pt",
 
26
  # Training parameters - Modify these to experiment
27
  training_config = {
28
  "num_classes": 39,
29
+ "n_epochs": 10,
30
  "learning_rate": 1e-3,
31
  "optimizer": "adam",
32
  "save_path": "best_model.pt",