Spaces:
Sleeping
Sleeping
| # --- Standard Python Library --- | |
| import os | |
| import random | |
| # --- Data Handling & Analysis --- | |
| import numpy as np | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from helpers.create_dataset import make_subset | |
| from helpers.transforms_loaders import make_dataset_loaders | |
| # --- Visualization --- | |
| import matplotlib.pyplot as plt | |
| # import seaborn as sns | |
| # --- PyTorch (Machine Learning) --- | |
| import torch | |
| # --- Experiment Tracking --- | |
| from clearml import Task | |
# -------- Tunable parameters --------
# Reproducibility
SEED = 42

# Dataset selection
DATASET_LINK = "DScomp380/plant_village"
DATASET_SUBSET_RATIO = 0.25

# Augmentation settings (forwarded to the loader helper via aug_config)
ROTATION = 30
BRIGHTNESS = 0.2
SATURATION = 0.2
BLUR = 3

# DataLoader settings
BATCH_SIZE = 32
TEST_SIZE = 0.3

# Seed every random number generator used here so experiments can be repeated.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# ----- ClearML Setup -----
project_name = "Small Group Project"

# Per the ClearML docs, Task.set_random_seed() sets the default seed used for
# tasks initialised *afterwards*. It therefore has to run before Task.init();
# calling it on the already-created task (as was done previously) leaves the
# task initialised with ClearML's own default seed instead of SEED.
Task.set_random_seed(SEED)

task = Task.init(
    project_name=f'{project_name}/Data Preparation',
    task_name='Data Preparation',
    task_type=Task.TaskTypes.data_processing,
)
clearml_logger = task.get_logger()

# -------- Track full configuration in ClearML --------
# Every controllable parameter is connected to the task so the run's settings
# are recorded alongside its outputs.
task.connect({
    "seed": SEED,
    "dataset": {
        "link": DATASET_LINK,
        "subset_ratio": DATASET_SUBSET_RATIO,
    },
    "augmentation": {
        "rotation": ROTATION,
        "brightness": BRIGHTNESS,
        "saturation": SATURATION,
        "blur": BLUR,
    },
    "dataloaders": {
        "batch_size": BATCH_SIZE,
        "test_size": TEST_SIZE,
    },
})
# ----- Load a subset from a given dataset & track with ClearML -----
(
    data_plants,
    prototyping_dataset,
    features,
    clearml_dataset,
) = make_subset(DATASET_LINK, DATASET_SUBSET_RATIO, clearml_logger)

# ---- Exploratory data analysis (EDA) ----
# Count how many samples of the prototyping subset fall under each label value.
labels_list = prototyping_dataset['label']
df_labels = pd.Series(labels_list)
label_count = df_labels.value_counts(sort=False)

# Summary statistics of the per-class sample counts.
min_count = label_count.min()
max_count = label_count.max()
mean_count = label_count.mean()
imbalance_ratio = max_count / min_count

# Log the class-balance scalars to ClearML (same title, one series each).
for series_name, series_value in (
    ("Min Class Count", min_count),
    ("Max Class Count", max_count),
    ("Imbalance Ratio (Max/Min)", imbalance_ratio),
):
    clearml_logger.report_scalar(
        title="Exploratory data analysis (EDA)",
        series=series_name,
        value=series_value,
        iteration=1,
    )

# Echo the same figures to the console.
print("--- Class imbalance analysis --- ")
print(f"Max labels in a class: {max_count}")
print(f"Min labels in a class: {min_count}")
print(f"Mean labels in a class: {mean_count}")
print(f"Imbalance ratio: {imbalance_ratio:.2f}")
# Mapping indices to class names
class_names = features['label'].names
# Turn underscores into spaces and collapse repeated whitespace for display.
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]

# label_count was built with value_counts(sort=False), which does not order
# rows by label id and leaves out classes that are absent from the subset.
# Assigning the names positionally could therefore mislabel bars or fail with
# a length mismatch. Sort by label id and look each name up by its id instead.
# NOTE(review): assumes labels are integer ids indexing into `class_names`.
label_count = label_count.sort_index()
label_count.index = [formatted_class_names[label_id] for label_id in label_count.index]

# Bar chart of samples per class, reported to ClearML as a matplotlib figure.
plt.figure(figsize=(10, 6))
label_count.plot(kind='bar', color='skyblue')
plt.title("Class Distribution in Prototype Dataset")
plt.xlabel("Class")
plt.ylabel("Count")
plt.tight_layout()
clearml_logger.report_matplotlib_figure(
    title="EDA Class Distribution",
    series="Prototype Subset",
    figure=plt.gcf(),
    iteration=1,
)
# ----------------------------------------------------------------------
# Script entry point: build the loaders for both datasets, record the dataset
# id in ClearML and close the task.
# NOTE(review): the source's indentation was lost; everything below is assumed
# to belong to the __main__ guard — confirm against the original file.
if __name__ == "__main__":
    # ---------------- Dataset splits ----------------
    aug_config = {
        'rotation': ROTATION,
        'brightness': BRIGHTNESS,
        'saturation': SATURATION,
        'blur': BLUR,
    }

    # Loaders for the prototyping subset.
    prototype_loaders = make_dataset_loaders(
        prototyping_dataset, SEED, BATCH_SIZE, TEST_SIZE, aug_config
    )
    print("\n--- Handoff Test Successful ---")
    print(f"Prototype Train loader batches: {len(prototype_loaders['train'])}")
    print(f"Prototype Validation loader batches: {len(prototype_loaders['val'])}")
    print(f"Prototype Test loader batches: {len(prototype_loaders['test'])}")
    clearml_logger.report_text(
        f"Prototype loaders created: "
        f"train={len(prototype_loaders['train'])}, "
        f"val={len(prototype_loaders['val'])}, "
        f"test={len(prototype_loaders['test'])}"
    )

    # Loaders for the `data_plants` dataset returned by make_subset.
    final_loaders = make_dataset_loaders(
        data_plants, SEED, BATCH_SIZE, TEST_SIZE, aug_config
    )
    print("\n--- Handoff Test Successful ---")
    print(f"Train loader batches: {len(final_loaders['train'])}")
    print(f"Validation loader batches: {len(final_loaders['val'])}")
    print(f"Test loader batches: {len(final_loaders['test'])}")

    # Record dataset info in ClearML
    task.connect_configuration(
        {"dataset_id": clearml_dataset.id},
        name="Dataset Metadata",
    )

    # Close the ClearML task.
    # task.mark_completed() is intentionally not called: per the ClearML docs
    # it also terminates the Python process that created the task, which would
    # end this script before task.close() and the final message below.
    # task.close() on its own closes the task and sets its status to Completed.
    task.close()
    print("\n--- Script Finished ---")