Spaces:
Running
Running
Anna Rachkova (k24040374)
committed on
Clean up code and remove redundancies
Browse files- data_preparation.py +11 -81
data_preparation.py
CHANGED
|
@@ -35,9 +35,6 @@ task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
|
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
| 38 |
-
print("✅ Checkpoint: Imports, SEED and ClearML are set")
|
| 39 |
-
|
| 40 |
-
|
| 41 |
# Loading dataset from Hugging Face and checking it
|
| 42 |
try:
|
| 43 |
ds = load_dataset("DScomp380/plant_village")
|
|
@@ -45,38 +42,8 @@ except Exception as e:
|
|
| 45 |
print(f"Error loading the dataset: {e}")
|
| 46 |
|
| 47 |
data_plants = ds['train']
|
| 48 |
-
|
| 49 |
-
print("--- Verification ---")
|
| 50 |
-
# Verification
|
| 51 |
-
print(f"\nLoaded object type: {type(data_plants)}")
|
| 52 |
-
print("\n --- \n")
|
| 53 |
-
|
| 54 |
data_length = len(data_plants)
|
| 55 |
-
print(f"\nLoaded object size: {data_length}")
|
| 56 |
-
print("\n --- \n")
|
| 57 |
-
|
| 58 |
features = data_plants.features
|
| 59 |
-
print(f"\nDataset features: {features}")
|
| 60 |
-
print("\n --- \n")
|
| 61 |
-
|
| 62 |
-
# Verifying label count
|
| 63 |
-
if 'label' in features and hasattr(features['label'], 'num_classes'):
|
| 64 |
-
label_count = features['label'].num_classes
|
| 65 |
-
print(f"Number of disease categories (labels): {label_count}")
|
| 66 |
-
else:
|
| 67 |
-
print("Couldnt determine the labels automatically")
|
| 68 |
-
print("\n --- \n")
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
# Verifying single sample
|
| 72 |
-
sample = data_plants[0]
|
| 73 |
-
print(f"Sample image type: {type(sample['image'])}")
|
| 74 |
-
print(f"Sample label: {sample['label']}")
|
| 75 |
-
print("\n --- \n")
|
| 76 |
-
|
| 77 |
-
print("✅ Checkpoint: Dataset is loaded and data is checked")
|
| 78 |
-
|
| 79 |
-
|
| 80 |
# --------------------------- Data selection --------------------------------
|
| 81 |
# Creating the prototyping dataset
|
| 82 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
|
@@ -97,10 +64,6 @@ subset_indices = indices[:subset_size]
|
|
| 97 |
|
| 98 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 99 |
|
| 100 |
-
print("✅ Checkpoint: Prototyping dataset is created")
|
| 101 |
-
#Verifying
|
| 102 |
-
print(f"Prototyping dataset size: {len(prototyping_dataset)}")
|
| 103 |
-
|
| 104 |
|
| 105 |
# ---- Exploratory data analysis (EDA) ----
|
| 106 |
|
|
@@ -113,7 +76,7 @@ label_count = df_labels.value_counts(sort = False)
|
|
| 113 |
|
| 114 |
min_count = label_count.min()
|
| 115 |
clearml_logger.report_scalar(
|
| 116 |
-
title="
|
| 117 |
series="Min Class Count",
|
| 118 |
value=min_count,
|
| 119 |
iteration=1
|
|
@@ -121,7 +84,7 @@ clearml_logger.report_scalar(
|
|
| 121 |
|
| 122 |
max_count = label_count.max()
|
| 123 |
clearml_logger.report_scalar(
|
| 124 |
-
title="
|
| 125 |
series="Max Class Count",
|
| 126 |
value=max_count,
|
| 127 |
iteration=1
|
|
@@ -129,46 +92,22 @@ clearml_logger.report_scalar(
|
|
| 129 |
|
| 130 |
mean_count = label_count.mean()
|
| 131 |
clearml_logger.report_scalar(
|
| 132 |
-
title="
|
| 133 |
series="Imbalance Ratio (Max/Min)",
|
| 134 |
value=(max_count / min_count),
|
| 135 |
iteration=1
|
| 136 |
)
|
| 137 |
-
|
| 138 |
print("--- Class imbalance analysis --- ")
|
| 139 |
print(f"Max labels in a class: {max_count}")
|
| 140 |
print(f"Min labels in a class: {min_count}")
|
| 141 |
print(f"Mean labels in a class: {mean_count}")
|
| 142 |
print(f"Imbalance ratio: {max_count/min_count:.2f}")
|
| 143 |
-
print("✅ Checkpoint: Class distribution is calculated")
|
| 144 |
|
| 145 |
# Mapping indices to class names
|
| 146 |
class_names = features['label'].names
|
| 147 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 148 |
label_count.index = formatted_class_names
|
| 149 |
|
| 150 |
-
|
| 151 |
-
# Creating bar chart with labels distribution
|
| 152 |
-
label_count.plot(kind='bar', figsize=(15,6))
|
| 153 |
-
plt.xlabel('Labels')
|
| 154 |
-
plt.ylabel('Sample count')
|
| 155 |
-
plt.title('Class distribution among chosen samples')
|
| 156 |
-
|
| 157 |
-
plot_file = 'class_distribution.png'
|
| 158 |
-
plt.savefig(plot_file)
|
| 159 |
-
|
| 160 |
-
clearml_logger.report_image(
|
| 161 |
-
title="EDA", # The title for the plot section in ClearML
|
| 162 |
-
series="Class Distribution", # The name of this specific plot
|
| 163 |
-
iteration=1, # The experiment step
|
| 164 |
-
local_path=plot_file # The path to the file you just saved
|
| 165 |
-
)
|
| 166 |
-
|
| 167 |
-
# To see the plot uncomment but itll pause the code
|
| 168 |
-
#plt.show()
|
| 169 |
-
print("✅ Checkpoint: Plot with classes distributions is created and saved")
|
| 170 |
-
|
| 171 |
-
|
| 172 |
# --------------- Data Splits ------------
|
| 173 |
def get_transform_pipelines():
|
| 174 |
"""
|
|
@@ -188,8 +127,6 @@ def get_transform_pipelines():
|
|
| 188 |
# Normalise the Tensor; Standardises pixel values
|
| 189 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 190 |
])
|
| 191 |
-
print("✅ Checkpoint: Transform pipeline created")
|
| 192 |
-
|
| 193 |
# Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
|
| 194 |
augmentation_pipeline = transforms.Compose([
|
| 195 |
# Randomly changing some parameters of pictures to enrich dataset
|
|
@@ -200,10 +137,7 @@ def get_transform_pipelines():
|
|
| 200 |
# Convert to Tensor and Normalise
|
| 201 |
transforms.ToTensor(),
|
| 202 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 203 |
-
])
|
| 204 |
-
|
| 205 |
-
print("✅ Checkpoint: Augmentation pipeline created")
|
| 206 |
-
|
| 207 |
# Return both pipelines
|
| 208 |
return normalisation_pipeline, augmentation_pipeline
|
| 209 |
|
|
@@ -232,8 +166,6 @@ def get_prototype_loaders(batch_size=32):
|
|
| 232 |
proto_val_split = split_2_dict['train']
|
| 233 |
proto_test_split = split_2_dict['test']
|
| 234 |
|
| 235 |
-
print("✅ Checkpoint: Prototype Dataset splitted")
|
| 236 |
-
|
| 237 |
# -- Putting splits through pipelines --
|
| 238 |
proto_train_split.set_transform(augmentation_pipeline)
|
| 239 |
proto_val_split.set_transform(normalisation_pipeline)
|
|
@@ -244,7 +176,6 @@ def get_prototype_loaders(batch_size=32):
|
|
| 244 |
proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
|
| 245 |
proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
|
| 246 |
|
| 247 |
-
print("✅ Checkpoint: Prototype DataLoaders are set")
|
| 248 |
return proto_train_loader, proto_val_loader, proto_test_loader
|
| 249 |
|
| 250 |
|
|
@@ -272,8 +203,6 @@ def get_final_loaders(batch_size=32):
|
|
| 272 |
val_split = split_2_dict['train']
|
| 273 |
test_split = split_2_dict['test']
|
| 274 |
|
| 275 |
-
print("✅ Checkpoint: Final Dataset splitted")
|
| 276 |
-
|
| 277 |
# -- Putting splits through pipelines --
|
| 278 |
train_split.set_transform(augmentation_pipeline)
|
| 279 |
val_split.set_transform(normalisation_pipeline)
|
|
@@ -283,22 +212,23 @@ def get_final_loaders(batch_size=32):
|
|
| 283 |
train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
|
| 284 |
val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
|
| 285 |
test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
|
| 286 |
-
|
| 287 |
-
print("✅ Checkpoint: Final DataLoaders are set")
|
| 288 |
return train_loader, val_loader, test_loader
|
| 289 |
|
| 290 |
# ----------------------------------------------------------------------
|
| 291 |
if __name__ == "__main__":
|
| 292 |
|
| 293 |
-
print("\nRunning data_preparation.py")
|
| 294 |
-
|
| 295 |
train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
|
| 296 |
-
|
| 297 |
print("\n--- Handoff Test Successful ---")
|
| 298 |
print(f"Train loader batches: {len(train_loader)}")
|
| 299 |
print(f"Validation loader batches: {len(val_loader)}")
|
| 300 |
print(f"Test loader batches: {len(test_loader)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
# Close the ClearML task
|
| 303 |
task.close()
|
| 304 |
-
print("\n--- Script Finished ---")
|
|
|
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
|
|
|
|
|
|
|
|
|
| 38 |
# Loading dataset from Hugging Face and checking it
|
| 39 |
try:
|
| 40 |
ds = load_dataset("DScomp380/plant_village")
|
|
|
|
| 42 |
print(f"Error loading the dataset: {e}")
|
| 43 |
|
| 44 |
data_plants = ds['train']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
data_length = len(data_plants)
|
|
|
|
|
|
|
|
|
|
| 46 |
features = data_plants.features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
# --------------------------- Data selection --------------------------------
|
| 48 |
# Creating the prototyping dataset
|
| 49 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
|
|
|
| 64 |
|
| 65 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# ---- Exploratory data analysis (EDA) ----
|
| 69 |
|
|
|
|
| 76 |
|
| 77 |
min_count = label_count.min()
|
| 78 |
clearml_logger.report_scalar(
|
| 79 |
+
title="Exploratory data analysis (EDA)",
|
| 80 |
series="Min Class Count",
|
| 81 |
value=min_count,
|
| 82 |
iteration=1
|
|
|
|
| 84 |
|
| 85 |
max_count = label_count.max()
|
| 86 |
clearml_logger.report_scalar(
|
| 87 |
+
title="Exploratory data analysis (EDA)",
|
| 88 |
series="Max Class Count",
|
| 89 |
value=max_count,
|
| 90 |
iteration=1
|
|
|
|
| 92 |
|
| 93 |
mean_count = label_count.mean()
|
| 94 |
clearml_logger.report_scalar(
|
| 95 |
+
title="Exploratory data analysis (EDA)",
|
| 96 |
series="Imbalance Ratio (Max/Min)",
|
| 97 |
value=(max_count / min_count),
|
| 98 |
iteration=1
|
| 99 |
)
|
|
|
|
| 100 |
print("--- Class imbalance analysis --- ")
|
| 101 |
print(f"Max labels in a class: {max_count}")
|
| 102 |
print(f"Min labels in a class: {min_count}")
|
| 103 |
print(f"Mean labels in a class: {mean_count}")
|
| 104 |
print(f"Imbalance ratio: {max_count/min_count:.2f}")
|
|
|
|
| 105 |
|
| 106 |
# Mapping indices to class names
|
| 107 |
class_names = features['label'].names
|
| 108 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 109 |
label_count.index = formatted_class_names
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
# --------------- Data Splits ------------
|
| 112 |
def get_transform_pipelines():
|
| 113 |
"""
|
|
|
|
| 127 |
# Normalise the Tensor; Standardises pixel values
|
| 128 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 129 |
])
|
|
|
|
|
|
|
| 130 |
# Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
|
| 131 |
augmentation_pipeline = transforms.Compose([
|
| 132 |
# Randomly changing some parameters of pictures to enrich dataset
|
|
|
|
| 137 |
# Convert to Tensor and Normalise
|
| 138 |
transforms.ToTensor(),
|
| 139 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 140 |
+
])
|
|
|
|
|
|
|
|
|
|
| 141 |
# Return both pipelines
|
| 142 |
return normalisation_pipeline, augmentation_pipeline
|
| 143 |
|
|
|
|
| 166 |
proto_val_split = split_2_dict['train']
|
| 167 |
proto_test_split = split_2_dict['test']
|
| 168 |
|
|
|
|
|
|
|
| 169 |
# -- Putting splits through pipelines --
|
| 170 |
proto_train_split.set_transform(augmentation_pipeline)
|
| 171 |
proto_val_split.set_transform(normalisation_pipeline)
|
|
|
|
| 176 |
proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
|
| 177 |
proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
|
| 178 |
|
|
|
|
| 179 |
return proto_train_loader, proto_val_loader, proto_test_loader
|
| 180 |
|
| 181 |
|
|
|
|
| 203 |
val_split = split_2_dict['train']
|
| 204 |
test_split = split_2_dict['test']
|
| 205 |
|
|
|
|
|
|
|
| 206 |
# -- Putting splits through pipelines --
|
| 207 |
train_split.set_transform(augmentation_pipeline)
|
| 208 |
val_split.set_transform(normalisation_pipeline)
|
|
|
|
| 212 |
train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
|
| 213 |
val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
|
| 214 |
test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
|
|
|
|
|
|
|
| 215 |
return train_loader, val_loader, test_loader
|
| 216 |
|
| 217 |
# ----------------------------------------------------------------------
|
| 218 |
if __name__ == "__main__":
|
| 219 |
|
|
|
|
|
|
|
| 220 |
train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
|
|
|
|
| 221 |
print("\n--- Handoff Test Successful ---")
|
| 222 |
print(f"Train loader batches: {len(train_loader)}")
|
| 223 |
print(f"Validation loader batches: {len(val_loader)}")
|
| 224 |
print(f"Test loader batches: {len(test_loader)}")
|
| 225 |
+
|
| 226 |
+
train_loader_fin, val_loader_fin, test_loader_fin = get_final_loaders(batch_size=32)
|
| 227 |
+
print("\n--- Handoff Test Successful ---")
|
| 228 |
+
print(f"Train loader batches: {len(train_loader_fin)}")
|
| 229 |
+
print(f"Validation loader batches: {len(val_loader_fin)}")
|
| 230 |
+
print(f"Test loader batches: {len(test_loader_fin)}")
|
| 231 |
|
| 232 |
# Close the ClearML task
|
| 233 |
task.close()
|
| 234 |
+
print("\n--- Script Finished ---")
|