# --- Standard Python Library --- import os import random # --- Data Handling & Analysis --- import numpy as np import pandas as pd from datasets import load_dataset from helpers.create_dataset import make_subset from helpers.transforms_loaders import make_dataset_loaders # --- Visualization --- import matplotlib.pyplot as plt # import seaborn as sns # --- PyTorch (Machine Learning) --- import torch # --- Experiment Tracking --- from clearml import Task # -------- Controllable parameters -------- # Dataset parameters SEED = 42 DATASET_LINK = "DScomp380/plant_village" DATASET_SUBSET_RATIO = 0.25 # Augmentation parameters ROTATION = 30 BRIGHTNESS = 0.2 SATURATION = 0.2 BLUR = 3 # DataLoader parameters BATCH_SIZE = 32 TEST_SIZE = 0.3 # Setting up the SEED to be able to repeat experiments random.seed(SEED) np.random.seed(SEED) torch.manual_seed(SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED) # ----- ClearML Setup ----- project_name = "Small Group Project" task = Task.init( project_name=f'{project_name}/Data Preparation', task_name='Data Preparation', task_type=Task.TaskTypes.data_processing ) task.set_random_seed(SEED) clearml_logger = task.get_logger() # -------- Track full configuration in ClearML -------- task.connect({ "seed": SEED, "dataset": { "link": DATASET_LINK, "subset_ratio": DATASET_SUBSET_RATIO, }, "augmentation": { "rotation": ROTATION, "brightness": BRIGHTNESS, "saturation": SATURATION, "blur": BLUR }, "dataloaders": { "batch_size": BATCH_SIZE, "test_size": TEST_SIZE } }) # ----- Load a subset from a given dataset & track with ClearML ----- data_plants, prototyping_dataset, features, clearml_dataset = make_subset( DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger ) # ---- Exploratory data analysis (EDA) ---- # Reformatting the label feature to understand bias labels_list = prototyping_dataset['label'] df_labels = pd.Series(labels_list) label_count = df_labels.value_counts(sort=False) # Checking the amount of samples in each class and logging it to clearML min_count = label_count.min() clearml_logger.report_scalar( title="Exploratory data analysis (EDA)", series="Min Class Count", value=min_count, iteration=1 ) max_count = label_count.max() clearml_logger.report_scalar( title="Exploratory data analysis (EDA)", series="Max Class Count", value=max_count, iteration=1 ) mean_count = label_count.mean() clearml_logger.report_scalar( title="Exploratory data analysis (EDA)", series="Imbalance Ratio (Max/Min)", value=(max_count / min_count), iteration=1 ) print("--- Class imbalance analysis --- ") print(f"Max labels in a class: {max_count}") print(f"Min labels in a class: {min_count}") print(f"Mean labels in a class: {mean_count}") print(f"Imbalance ratio: {max_count/min_count:.2f}") # Mapping indeces to class names class_names = features['label'].names formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names] label_count.index = formatted_class_names plt.figure(figsize=(10,6)) label_count.plot(kind='bar', color='skyblue') plt.title("Class Distribution in Prototype Dataset") plt.xlabel("Class") plt.ylabel("Count") plt.tight_layout() clearml_logger.report_matplotlib_figure( title="EDA Class Distribution", series="Prototype Subset", figure=plt.gcf(), iteration=1 ) # ---------------------------------------------------------------------- if __name__ == "__main__": # ---------------- Dataset splits ---------------- aug_config = { 'rotation': ROTATION, 'brightness': BRIGHTNESS, 'saturation': SATURATION, 'blur': BLUR } prototype_loaders = make_dataset_loaders( prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config ) print("\n--- Handoff Test Successful ---") print(f"Prototype Train loader batches: {len(prototype_loaders['train'])}") print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}") print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}") clearml_logger.report_text( f"Prototype loaders created: " f"train={len(prototype_loaders['train'])}, " f"val={len(prototype_loaders['val'])}, " f"test={len(prototype_loaders['test'])}" ) final_loaders = make_dataset_loaders( data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config ) print("\n--- Handoff Test Successful ---") print(f"Train loader batches: {len(final_loaders['train'])}") print(f"Validation loader batches: {len(final_loaders['val'])}") print(f"Test loader batches: {len(final_loaders['test'])}") # Record dataset info in ClearML task.connect_configuration( {"dataset_id": clearml_dataset.id}, name="Dataset Metadata" ) task.mark_completed() # Close the ClearML task task.close() print("\n--- Script Finished ---")