Spaces:
Sleeping
Sleeping
Yusuf
committed on
Commit
·
6b1327e
1
Parent(s):
3562c3d
FEAT: log dataset to clearml
Browse files
- dataPrep/data_preparation.py +51 -4
dataPrep/data_preparation.py
CHANGED
|
@@ -17,7 +17,7 @@ from torchvision import transforms
|
|
| 17 |
from torch.utils.data import DataLoader
|
| 18 |
|
| 19 |
# --- Experiment Tracking ---
|
| 20 |
-
from clearml import Task, Logger
|
| 21 |
|
| 22 |
|
| 23 |
# Setting up the SEED to be able to repeat experiments
|
|
@@ -31,7 +31,7 @@ if torch.cuda.is_available():
|
|
| 31 |
|
| 32 |
# Initialising a task on ClearML
|
| 33 |
# UPDATE CLEARML
|
| 34 |
-
task = Task.init(project_name= '
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
|
@@ -44,11 +44,12 @@ except Exception as e:
|
|
| 44 |
data_plants = ds['train']
|
| 45 |
data_length = len(data_plants)
|
| 46 |
features = data_plants.features
|
|
|
|
| 47 |
# --------------------------- Data selection --------------------------------
|
| 48 |
# Creating the prototyping dataset
|
| 49 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
| 50 |
|
| 51 |
-
#
|
| 52 |
task.connect_configuration(
|
| 53 |
{"subset_ratio": SUBSET_RATIO},
|
| 54 |
name="Data subsetting"
|
|
@@ -61,9 +62,33 @@ subset_size = int(data_length * SUBSET_RATIO)
|
|
| 61 |
indices = list(range(data_length))
|
| 62 |
random.shuffle(indices)
|
| 63 |
subset_indices = indices[:subset_size]
|
| 64 |
-
|
| 65 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# ---- Exploratory data analysis (EDA) ----
|
| 69 |
|
|
@@ -108,6 +133,22 @@ class_names = features['label'].names
|
|
| 108 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 109 |
label_count.index = formatted_class_names
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
# --------------- Data Splits ------------
|
| 112 |
def get_transform_pipelines():
|
| 113 |
"""
|
|
@@ -228,6 +269,12 @@ if __name__ == "__main__":
|
|
| 228 |
print(f"Train loader batches: {len(train_loader_fin)}")
|
| 229 |
print(f"Validation loader batches: {len(val_loader_fin)}")
|
| 230 |
print(f"Test loader batches: {len(test_loader_fin)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
# Close the ClearML task
|
| 233 |
task.close()
|
|
|
|
| 17 |
from torch.utils.data import DataLoader
|
| 18 |
|
| 19 |
# --- Experiment Tracking ---
|
| 20 |
+
from clearml import Task, Logger, Dataset
|
| 21 |
|
| 22 |
|
| 23 |
# Setting up the SEED to be able to repeat experiments
|
|
|
|
| 31 |
|
| 32 |
# Initialising a task on ClearML
|
| 33 |
# UPDATE CLEARML
|
| 34 |
+
task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
|
|
|
| 44 |
data_plants = ds['train']
|
| 45 |
data_length = len(data_plants)
|
| 46 |
features = data_plants.features
|
| 47 |
+
|
| 48 |
# --------------------------- Data selection --------------------------------
|
| 49 |
# Creating the prototyping dataset
|
| 50 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
| 51 |
|
| 52 |
+
# Log subset config to ClearML
|
| 53 |
task.connect_configuration(
|
| 54 |
{"subset_ratio": SUBSET_RATIO},
|
| 55 |
name="Data subsetting"
|
|
|
|
| 62 |
indices = list(range(data_length))
|
| 63 |
random.shuffle(indices)
|
| 64 |
subset_indices = indices[:subset_size]
|
|
|
|
| 65 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 66 |
|
| 67 |
+
# Register this subset in ClearML
|
| 68 |
+
dataset = Dataset.create(
|
| 69 |
+
dataset_name="Plant Village Prototype",
|
| 70 |
+
dataset_project="smallGroupProject",
|
| 71 |
+
dataset_tags=["prototype", "subset"]
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# Save indices used for reproducibility
|
| 75 |
+
subset_path = "subset_indices.npy"
|
| 76 |
+
np.save(subset_path, subset_indices)
|
| 77 |
+
dataset.add_files(subset_path)
|
| 78 |
+
|
| 79 |
+
# Add simple metadata
|
| 80 |
+
dataset.set_metadata({
|
| 81 |
+
"subset_ratio": SUBSET_RATIO,
|
| 82 |
+
"total_samples": len(prototyping_dataset)
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
# Upload to ClearML storage
|
| 86 |
+
dataset.upload()
|
| 87 |
+
dataset.finalize()
|
| 88 |
+
|
| 89 |
+
# Log the dataset ID
|
| 90 |
+
clearml_logger.report_text(f"Created ClearML Dataset: {dataset.id}")
|
| 91 |
+
|
| 92 |
|
| 93 |
# ---- Exploratory data analysis (EDA) ----
|
| 94 |
|
|
|
|
| 133 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 134 |
label_count.index = formatted_class_names
|
| 135 |
|
| 136 |
+
plt.figure(figsize=(10,6))
|
| 137 |
+
label_count.plot(kind='bar', color='skyblue')
|
| 138 |
+
plt.title("Class Distribution in Prototype Dataset")
|
| 139 |
+
plt.xlabel("Class")
|
| 140 |
+
plt.ylabel("Count")
|
| 141 |
+
plt.tight_layout()
|
| 142 |
+
plt.savefig("class_distribution.png")
|
| 143 |
+
|
| 144 |
+
clearml_logger.report_image(
|
| 145 |
+
title="EDA Class Distribution",
|
| 146 |
+
series="Prototype Subset",
|
| 147 |
+
local_path="class_distribution.png",
|
| 148 |
+
iteration=1
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
# --------------- Data Splits ------------
|
| 153 |
def get_transform_pipelines():
|
| 154 |
"""
|
|
|
|
| 269 |
print(f"Train loader batches: {len(train_loader_fin)}")
|
| 270 |
print(f"Validation loader batches: {len(val_loader_fin)}")
|
| 271 |
print(f"Test loader batches: {len(test_loader_fin)}")
|
| 272 |
+
|
| 273 |
+
# Record dataset info in ClearML
|
| 274 |
+
task.connect_configuration(
|
| 275 |
+
{"dataset_id": dataset.id},
|
| 276 |
+
name="Dataset Metadata"
|
| 277 |
+
)
|
| 278 |
|
| 279 |
# Close the ClearML task
|
| 280 |
task.close()
|