Spaces:

k23064919
/

smallGroupProject

Sleeping

App Files Files Community

Yusuf committed on Nov 13, 2025

Commit

6b1327e

1 Parent(s): 3562c3d

FEAT: log dataset to clearml

Browse files

Files changed (1) hide show

dataPrep/data_preparation.py +51 -4

dataPrep/data_preparation.py CHANGED Viewed

@@ -17,7 +17,7 @@ from torchvision import transforms
 from torch.utils.data import DataLoader
 # --- Experiment Tracking ---
-from clearml import Task, Logger
 # Setting up the SEED to be able to repeat experiments
@@ -31,7 +31,7 @@ if torch.cuda.is_available():
 # Initialising a task on ClearML
 # UPDATE CLEARML
-task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
@@ -44,11 +44,12 @@ except Exception as e:
 data_plants = ds['train']
 data_length = len(data_plants)
 features = data_plants.features
 # --------------------------- Data selection --------------------------------
 # Creating the prototyping dataset
 SUBSET_RATIO = 0.25 # 25% for prototyping
-# Loggint it to ClearML
 task.connect_configuration(
     {"subset_ratio": SUBSET_RATIO},
     name="Data subsetting"
@@ -61,9 +62,33 @@ subset_size = int(data_length * SUBSET_RATIO)
 indices = list(range(data_length))
 random.shuffle(indices)
 subset_indices = indices[:subset_size]
 prototyping_dataset = data_plants.select(subset_indices)
 # ---- Exploratory data analysis (EDA) ----
@@ -108,6 +133,22 @@ class_names = features['label'].names
 formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
 label_count.index = formatted_class_names
 # --------------- Data Splits ------------
 def get_transform_pipelines():
     """
@@ -228,6 +269,12 @@ if __name__ == "__main__":
     print(f"Train loader batches: {len(train_loader_fin)}")
     print(f"Validation loader batches: {len(val_loader_fin)}")
     print(f"Test loader batches: {len(test_loader_fin)}")
     # Close the ClearML task
     task.close()

 from torch.utils.data import DataLoader
 # --- Experiment Tracking ---
+from clearml import Task, Logger, Dataset
 # Setting up the SEED to be able to repeat experiments
 # Initialising a task on ClearML
 # UPDATE CLEARML
+task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
 task.set_random_seed(SEED)
 clearml_logger = task.get_logger()
 data_plants = ds['train']
 data_length = len(data_plants)
 features = data_plants.features
 # --------------------------- Data selection --------------------------------
 # Creating the prototyping dataset
 SUBSET_RATIO = 0.25 # 25% for prototyping
+# Log subset config to ClearML
 task.connect_configuration(
     {"subset_ratio": SUBSET_RATIO},
     name="Data subsetting"
 indices = list(range(data_length))
 random.shuffle(indices)
 subset_indices = indices[:subset_size]
 prototyping_dataset = data_plants.select(subset_indices)
+# Register this subset in ClearML
+dataset = Dataset.create(
+    dataset_name="Plant Village Prototype",
+    dataset_project="smallGroupProject",
+    dataset_tags=["prototype", "subset"]
+)
+# Save indices used for reproducibility
+subset_path = "subset_indices.npy"
+np.save(subset_path, subset_indices)
+dataset.add_files(subset_path)
+# Add simple metadata
+dataset.set_metadata({
+    "subset_ratio": SUBSET_RATIO,
+    "total_samples": len(prototyping_dataset)
+})
+# Upload to ClearML storage
+dataset.upload()
+dataset.finalize()
+# Log the dataset ID
+clearml_logger.report_text(f"Created ClearML Dataset: {dataset.id}")
 # ---- Exploratory data analysis (EDA) ----
 formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
 label_count.index = formatted_class_names
+plt.figure(figsize=(10,6))
+label_count.plot(kind='bar', color='skyblue')
+plt.title("Class Distribution in Prototype Dataset")
+plt.xlabel("Class")
+plt.ylabel("Count")
+plt.tight_layout()
+plt.savefig("class_distribution.png")
+clearml_logger.report_image(
+    title="EDA Class Distribution",
+    series="Prototype Subset",
+    local_path="class_distribution.png",
+    iteration=1
+)
 # --------------- Data Splits ------------
 def get_transform_pipelines():
     """
     print(f"Train loader batches: {len(train_loader_fin)}")
     print(f"Validation loader batches: {len(val_loader_fin)}")
     print(f"Test loader batches: {len(test_loader_fin)}")
+    # Record dataset info in ClearML
+    task.connect_configuration(
+        {"dataset_id": dataset.id},
+        name="Dataset Metadata"
+    )
     # Close the ClearML task
     task.close()