Spaces:

k23064919
/

smallGroupProject

Runtime error

App Files Files Community

Yusuf committed on Nov 27, 2025

Commit

03021e1

1 Parent(s): 0abee12

fix: replace clearml datasets with artifacts

Browse files

Files changed (4) hide show

dataPrep/data_preparation.py +10 -13
dataPrep/helpers/clearml_data.py +42 -16
dataPrep/helpers/create_dataset.py +9 -30
trainingModel/run_training.py +1 -1

dataPrep/data_preparation.py CHANGED Viewed

@@ -74,15 +74,15 @@ task.connect({
 })
 # ----- Load a subset from a given dataset & track with ClearML -----
-data_plants, prototyping_dataset, features, clearml_dataset = make_subset(
-    DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger
 )
 # ---- Exploratory data analysis (EDA) ----
 # Reformatting the label feature to understand bias
-labels_list = prototyping_dataset['label']
 df_labels = pd.Series(labels_list)
 label_count = df_labels.value_counts(sort=False)
@@ -111,6 +111,7 @@ clearml_logger.report_scalar(
     value=(max_count / min_count),
     iteration=1
 )
 print("--- Class imbalance analysis --- ")
 print(f"Max labels in a class: {max_count}")
 print(f"Min labels in a class: {min_count}")
@@ -122,16 +123,17 @@ class_names = features['label'].names
 formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
 label_count.index = formatted_class_names
 plt.figure(figsize=(10,6))
 label_count.plot(kind='bar', color='skyblue')
-plt.title("Class Distribution in Prototype Dataset")
 plt.xlabel("Class")
 plt.ylabel("Count")
 plt.tight_layout()
 clearml_logger.report_matplotlib_figure(
     title="EDA Class Distribution",
-    series="Prototype Subset",
     figure=plt.gcf(),
     iteration=1
 )
@@ -149,7 +151,7 @@ if __name__ == "__main__":
     }
     prototype_loaders = make_dataset_loaders(
-        prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
     )
     print("\n--- Handoff Test Successful ---")
@@ -173,14 +175,9 @@ if __name__ == "__main__":
     print(f"Validation loader batches: {len(final_loaders['val'])}")
     print(f"Test loader batches: {len(final_loaders['test'])}")
-    # Record dataset info in ClearML
-    task.connect_configuration(
-        {"dataset_id": clearml_dataset.id},
-        name="Dataset Metadata"
-    )
-    task.mark_completed()
     # Close the ClearML task
     task.close()
     print("\n--- Script Finished ---")

 })
 # ----- Load a subset from a given dataset & track with ClearML -----
+data_plants, subset_dataset, features = make_subset(
+    DATASET_LINK, DATASET_SUBSET_RATIO, task
 )
 # ---- Exploratory data analysis (EDA) ----
 # Reformatting the label feature to understand bias
+labels_list = subset_dataset['label']
 df_labels = pd.Series(labels_list)
 label_count = df_labels.value_counts(sort=False)
     value=(max_count / min_count),
     iteration=1
 )
 print("--- Class imbalance analysis --- ")
 print(f"Max labels in a class: {max_count}")
 print(f"Min labels in a class: {min_count}")
 formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
 label_count.index = formatted_class_names
+# Plotting class distribution
 plt.figure(figsize=(10,6))
 label_count.plot(kind='bar', color='skyblue')
+plt.title("Class Distribution in Subset Dataset")
 plt.xlabel("Class")
 plt.ylabel("Count")
 plt.tight_layout()
 clearml_logger.report_matplotlib_figure(
     title="EDA Class Distribution",
+    series="Subset Dataset",
     figure=plt.gcf(),
     iteration=1
 )
     }
     prototype_loaders = make_dataset_loaders(
+        subset_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
     )
     print("\n--- Handoff Test Successful ---")
     print(f"Validation loader batches: {len(final_loaders['val'])}")
     print(f"Test loader batches: {len(final_loaders['test'])}")
     # Close the ClearML task
+    task.mark_completed()
     task.close()
     print("\n--- Script Finished ---")

dataPrep/helpers/clearml_data.py CHANGED Viewed

@@ -7,37 +7,62 @@ from dataPrep.helpers.transforms_loaders import make_dataset_loaders
 '''
-Takes latest Data Prep ClearML task from project and extracts data loaders and metadata
 '''
 def extract_latest_data_task(project_name: str = "Small Group Project"):
-  all_tasks = Task.get_tasks(project_name=project_name)
   if not all_tasks:
       raise RuntimeError(f"No tasks found in project '{project_name}'")
-  dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
   if not dp_tasks:
       raise RuntimeError("No 'Data Preparation' tasks found in this project!")
   # Latest Data Prep Task
-  latest_task = max(dp_tasks, key=lambda t: t.id)
   DYNAMIC_TASK_ID = latest_task.id
   DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
-  # Dataset ID
-  config_objects = DATA_PREP.get_configuration_objects()
-  raw_meta = config_objects["Dataset Metadata"]
-  dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
-  # Load ClearML Dataset
-  subset_clearml = Dataset.get(dataset_id=dataset_id)
-  local_folder = subset_clearml.get_local_copy()
-  subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
-  # Load Dataset Parameters
   data_params = DATA_PREP.get_parameters()
   dataset_link = data_params['General/dataset/link']
   # Load Full Dataset
   try:
@@ -59,11 +84,12 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
   # Gather data prep task metadata
   data_prep_metadata = {
     "data_prep_task_id": DYNAMIC_TASK_ID,
-    "dataset_id": dataset_id,
     "dataset_link": dataset_link,
     "augmentation_used": aug_config,
     "batch_size_used": batch_size,
     "seed_used": seed,
   }
   return subset_loaders, full_loaders, data_prep_metadata
@@ -74,7 +100,7 @@ Takes a given dataset, subset, data params to create DataLoaders
 Loaders split data into train, val, test
 '''
 def get_data_loaders(data_params, subset_dataset, full_dataset):
   # Extract data parameters- these will be used in the DataLoaders
   seed = int(data_params['General/seed'])
   batch_size = int(data_params['General/dataloaders/batch_size'])

 '''
+Takes the latest Data Prep ClearML task from the project and reconstructs:
+- data loaders for both full and subset datasets
+- Aug settings used
 '''
 def extract_latest_data_task(project_name: str = "Small Group Project"):
+  # --------- Get latest Data Preparation task from ClearML ---------
+  all_tasks = Task.get_tasks(
+    project_name=project_name,
+    allow_archived=False,
+    task_filter={'order_by': ["-last_update"]},
+  )
   if not all_tasks:
       raise RuntimeError(f"No tasks found in project '{project_name}'")
+  dp_tasks = [
+    t for t in all_tasks
+    if t.task_type == Task.TaskTypes.data_processing
+    and t.completed is not None
+  ]
   if not dp_tasks:
       raise RuntimeError("No 'Data Preparation' tasks found in this project!")
   # Latest Data Prep Task
+  latest_task = dp_tasks[0]
   DYNAMIC_TASK_ID = latest_task.id
   DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
+  # Load subset indices artifact from Data Prep task
+  artifacts = DATA_PREP.artifacts
+  if "subset_indices" not in artifacts:
+      raise RuntimeError("Data Prep task did not upload 'subset_indices' artifact!")
+  artifact = artifacts["subset_indices"]
+  subset_indices_path = artifact.get_local_copy()
+  subset_indices = np.load(subset_indices_path)
+  # Load dataset metadata from Data Prep task
   data_params = DATA_PREP.get_parameters()
+  subset_ratio = float(data_params['General/dataset/subset_ratio'])
   dataset_link = data_params['General/dataset/link']
+  seed = int(data_params['General/seed'])
+  batch_size = int(data_params['General/dataloaders/batch_size'])
+  test_size = float(data_params['General/dataloaders/test_size'])
+  aug_config = {
+      'rotation': float(data_params['General/augmentation/rotation']),
+      'brightness': float(data_params['General/augmentation/brightness']),
+      'saturation': float(data_params['General/augmentation/saturation']),
+      'blur': float(data_params['General/augmentation/blur']),
+  }
   # Load Full Dataset
   try:
   # Gather data prep task metadata
   data_prep_metadata = {
     "data_prep_task_id": DYNAMIC_TASK_ID,
     "dataset_link": dataset_link,
+    "subset_ratio_used": subset_ratio,
     "augmentation_used": aug_config,
     "batch_size_used": batch_size,
     "seed_used": seed,
+    "test_size_used": test_size
   }
   return subset_loaders, full_loaders, data_prep_metadata
 Loaders split data into train, val, test
 '''
 def get_data_loaders(data_params, subset_dataset, full_dataset):
   # Extract data parameters- these will be used in the DataLoaders
   seed = int(data_params['General/seed'])
   batch_size = int(data_params['General/dataloaders/batch_size'])

dataPrep/helpers/create_dataset.py CHANGED Viewed

@@ -6,7 +6,6 @@ import os
 import random
 import numpy as np
 from datasets import load_dataset
-from clearml import Dataset
 '''
@@ -14,7 +13,7 @@ Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML
 Subset indices are uploaded to ClearML for reproducibility
 REPRODUCE: Load full DS, then load indices from ClearML to get same subset
 '''
-def make_subset(dataset_link, subset_ratio, clearml_logger):
     # Load dataset
     try:
@@ -34,36 +33,16 @@ def make_subset(dataset_link, subset_ratio, clearml_logger):
     random.shuffle(indices)
     subset_indices = indices[:subset_size]
-    prototyping_dataset = data_plants.select(subset_indices)
-# I THINK WE NEED TO REMOVE THIS LATER
-# We dont really need to upload subset everytime (Im not sure tho)
-    # Register subset in ClearML
-    clearml_dataset = Dataset.create(
-        dataset_name="Plant Village Prototype",
-        dataset_project="Small Group Project",
-        dataset_tags=["prototype", "subset"],
-        use_current_task=False
-    )
-    clearml_dataset.add_tags([
-        f"subset_ratio_{subset_ratio}",
-        "hf_source"
-    ])
-    # Save indices
     subset_path = "subset_indices.npy"
     np.save(subset_path, subset_indices)
-    clearml_dataset.add_files(subset_path)
-    clearml_dataset.set_metadata({
-        "huggingface_dataset": dataset_link,
-        "subset_ratio": subset_ratio,
-        "total_samples": len(prototyping_dataset)
-    })
-    clearml_dataset.upload()
-    clearml_dataset.finalize()
-    clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
-    # Clean up local file
-    os.remove(subset_path)
-    return data_plants, prototyping_dataset, features, clearml_dataset

 import random
 import numpy as np
 from datasets import load_dataset
 '''
 Subset indices are uploaded to ClearML for reproducibility
 REPRODUCE: Load full DS, then load indices from ClearML to get same subset
 '''
+def make_subset(dataset_link, subset_ratio, clearml_task):
     # Load dataset
     try:
     random.shuffle(indices)
     subset_indices = indices[:subset_size]
+    subset_dataset = data_plants.select(subset_indices)
+    # -------- Upload the subset indices as a ClearML artifact --------
     subset_path = "subset_indices.npy"
     np.save(subset_path, subset_indices)
+    clearml_task.upload_artifact(
+        name="subset_indices",
+        artifact_object=subset_path
+    )
+    clearml_task.get_logger().report_text(f"Uploaded subset indices as artifact: {subset_path}")
+    return data_plants, subset_dataset, features

trainingModel/run_training.py CHANGED Viewed

@@ -26,7 +26,7 @@ training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
 # Training parameters - Modify these to experiment
 training_config = {
     "num_classes": 39,
-    "n_epochs": 3,
     "learning_rate": 1e-3,
     "optimizer": "adam",
     "save_path": "best_model.pt",

 # Training parameters - Modify these to experiment
 training_config = {
     "num_classes": 39,
+    "n_epochs": 10,
     "learning_rate": 1e-3,
     "optimizer": "adam",
     "save_path": "best_model.pt",