Spaces:

k23064919
/

smallGroupProject

Sleeping

App Files Files Community

ra1425 committed on Nov 6, 2025

Commit

c0dc8ab

1 Parent(s): 71353f6

FEAT: Completed EDA, subsetting, and logged all artifacts to ClearML

Browse files

Files changed (1) hide show

data_preparation.py +96 -1

data_preparation.py CHANGED Viewed

@@ -1,6 +1,12 @@
 import os
 import random
 import numpy as np
 import torch
 from clearml import Task, Logger
 from datasets import load_dataset
@@ -16,6 +22,8 @@ if torch.cuda.is_available():
 # Initialising a task on ClearML
 task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
 task.set_random_seed(SEED)
 # Loading dataset from Hugging Face
 try:
@@ -44,4 +52,91 @@ else:
 # Verifying single sample
 sample = data_plants[0]
 print(f"Sample image type: {type(sample['image'])}")
-print(f"Sample label: {sample['label']}")

 import os
 import random
 import numpy as np
+import pandas as pd
+# Visualisation
+#import seaborn as sns
+import matplotlib.pyplot as plt
 import torch
 from clearml import Task, Logger
 from datasets import load_dataset
 # Initialising a task on ClearML
 task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
 task.set_random_seed(SEED)
+clearml_logger = task.get_logger()
 # Loading dataset from Hugging Face
 try:
 # Verifying single sample
 sample = data_plants[0]
 print(f"Sample image type: {type(sample['image'])}")
+print(f"Sample label: {sample['label']}")
+# -----------------------------------------------------------
+# Creating the prototyping dataset
+SUBSET_RATIO = 0.25 # 25% for prototyping
+# Loggint it to ClearML
+task.connect_configuration(
+    {"subset_ratio": SUBSET_RATIO},
+    name="Data subsetting"
+)
+# Calculate amount of samples we use
+subset_size = int(data_length * SUBSET_RATIO)
+# Creating a subset of random data (by their indices)
+indices = list(range(data_length))
+random.shuffle(indices)
+subset_indices = indices[:subset_size]
+prototyping_dataset = data_plants.select(subset_indices)
+#Verifying
+print(f"Prototyping dataset size: {len(prototyping_dataset)}")
+# -----------------------------------------------------------
+# Exploratory data analysis (EDA)
+#sns.set(color_codes = True)
+# Reformatting the label feature to understand bias
+labels_list = prototyping_dataset['label']
+df_labels = pd.Series(labels_list)
+label_count = df_labels.value_counts(sort = False)
+# Checking the amount of samples in each class and logging it to clearML
+min_count = label_count.min()
+clearml_logger.report_scalar(
+    title="Classes Counts",
+    series="Min Class Count",
+    value=min_count,
+    iteration=1
+)
+max_count = label_count.max()
+clearml_logger.report_scalar(
+    title="Classes Counts",
+    series="Max Class Count",
+    value=max_count,
+    iteration=1
+)
+mean_count = label_count.mean()
+clearml_logger.report_scalar(
+    title="Classes Counts",
+    series="Imbalance Ratio (Max/Min)",
+    value=(max_count / min_count),
+    iteration=1
+)
+print("Class imbalance analysis: ")
+print(f"Max labels in a class: {max_count}")
+print(f"Min labels in a class: {min_count}")
+print(f"Mean labels in a class: {mean_count}")
+print(f"Imbalance ratio: {max_count/min_count:.2f}")
+# Mapping indeces to class names
+class_names = features['label'].names
+formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
+label_count.index = formatted_class_names
+# Creating bar chart with labels distribution
+label_count.plot(kind='bar', figsize=(15,6))
+plt.xlabel('Labels')
+plt.ylabel('Sample count')
+plt.title('Class distribution among chosen samples')
+plot_file = 'class_distribution.png'
+plt.savefig(plot_file)
+clearml_logger.report_image(
+    title="EDA",                    # The title for the plot section in ClearML
+    series="Class Distribution",    # The name of this specific plot
+    iteration=1,                  # The experiment step
+    local_path=plot_file      # The path to the file you just saved
+)
+plt.show()