Spaces:
Running
Running
Anna Rachkova (k24040374)
committed on
Clean up code and remove redundancies
Browse files- data_preparation.py +11 -81
data_preparation.py
CHANGED
|
@@ -35,9 +35,6 @@ task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
|
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
| 38 |
-
print("✅ Checkpoint: Imports, SEED and ClearML are set")
|
| 39 |
-
|
| 40 |
-
|
| 41 |
# Loading dataset from Hugging Face and checking it
|
| 42 |
try:
|
| 43 |
ds = load_dataset("DScomp380/plant_village")
|
|
@@ -45,38 +42,8 @@ except Exception as e:
|
|
| 45 |
print(f"Error loading the dataset: {e}")
|
| 46 |
|
| 47 |
data_plants = ds['train']
|
| 48 |
-
|
| 49 |
-
print("--- Verification ---")
|
| 50 |
-
# Verification
|
| 51 |
-
print(f"\nLoaded object type: {type(data_plants)}")
|
| 52 |
-
print("\n --- \n")
|
| 53 |
-
|
| 54 |
data_length = len(data_plants)
|
| 55 |
-
print(f"\nLoaded object size: {data_length}")
|
| 56 |
-
print("\n --- \n")
|
| 57 |
-
|
| 58 |
features = data_plants.features
|
| 59 |
-
print(f"\nDataset features: {features}")
|
| 60 |
-
print("\n --- \n")
|
| 61 |
-
|
| 62 |
-
# Verifying label count
|
| 63 |
-
if 'label' in features and hasattr(features['label'], 'num_classes'):
|
| 64 |
-
label_count = features['label'].num_classes
|
| 65 |
-
print(f"Number of disease categories (labels): {label_count}")
|
| 66 |
-
else:
|
| 67 |
-
print("Couldnt determine the labels automatically")
|
| 68 |
-
print("\n --- \n")
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
# Verifying single sample
|
| 72 |
-
sample = data_plants[0]
|
| 73 |
-
print(f"Sample image type: {type(sample['image'])}")
|
| 74 |
-
print(f"Sample label: {sample['label']}")
|
| 75 |
-
print("\n --- \n")
|
| 76 |
-
|
| 77 |
-
print("✅ Checkpoint: Dataset is loaded and data is checked")
|
| 78 |
-
|
| 79 |
-
|
| 80 |
# --------------------------- Data selection --------------------------------
|
| 81 |
# Creating the prototyping dataset
|
| 82 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
|
@@ -97,10 +64,6 @@ subset_indices = indices[:subset_size]
|
|
| 97 |
|
| 98 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 99 |
|
| 100 |
-
print("✅ Checkpoint: Prototyping dataset is created")
|
| 101 |
-
#Verifying
|
| 102 |
-
print(f"Prototyping dataset size: {len(prototyping_dataset)}")
|
| 103 |
-
|
| 104 |
|
| 105 |
# ---- Exploratory data analysis (EDA) ----
|
| 106 |
|
|
@@ -113,7 +76,7 @@ label_count = df_labels.value_counts(sort = False)
|
|
| 113 |
|
| 114 |
min_count = label_count.min()
|
| 115 |
clearml_logger.report_scalar(
|
| 116 |
-
title="
|
| 117 |
series="Min Class Count",
|
| 118 |
value=min_count,
|
| 119 |
iteration=1
|
|
@@ -121,7 +84,7 @@ clearml_logger.report_scalar(
|
|
| 121 |
|
| 122 |
max_count = label_count.max()
|
| 123 |
clearml_logger.report_scalar(
|
| 124 |
-
title="
|
| 125 |
series="Max Class Count",
|
| 126 |
value=max_count,
|
| 127 |
iteration=1
|
|
@@ -129,46 +92,22 @@ clearml_logger.report_scalar(
|
|
| 129 |
|
| 130 |
mean_count = label_count.mean()
|
| 131 |
clearml_logger.report_scalar(
|
| 132 |
-
title="
|
| 133 |
series="Imbalance Ratio (Max/Min)",
|
| 134 |
value=(max_count / min_count),
|
| 135 |
iteration=1
|
| 136 |
)
|
| 137 |
-
|
| 138 |
print("--- Class imbalance analysis --- ")
|
| 139 |
print(f"Max labels in a class: {max_count}")
|
| 140 |
print(f"Min labels in a class: {min_count}")
|
| 141 |
print(f"Mean labels in a class: {mean_count}")
|
| 142 |
print(f"Imbalance ratio: {max_count/min_count:.2f}")
|
| 143 |
-
print("✅ Checkpoint: Class distribution is calculated")
|
| 144 |
|
| 145 |
# Mapping indices to class names
|
| 146 |
class_names = features['label'].names
|
| 147 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 148 |
label_count.index = formatted_class_names
|
| 149 |
|
| 150 |
-
|
| 151 |
-
# Creating bar chart with labels distribution
|
| 152 |
-
label_count.plot(kind='bar', figsize=(15,6))
|
| 153 |
-
plt.xlabel('Labels')
|
| 154 |
-
plt.ylabel('Sample count')
|
| 155 |
-
plt.title('Class distribution among chosen samples')
|
| 156 |
-
|
| 157 |
-
plot_file = 'class_distribution.png'
|
| 158 |
-
plt.savefig(plot_file)
|
| 159 |
-
|
| 160 |
-
clearml_logger.report_image(
|
| 161 |
-
title="EDA", # The title for the plot section in ClearML
|
| 162 |
-
series="Class Distribution", # The name of this specific plot
|
| 163 |
-
iteration=1, # The experiment step
|
| 164 |
-
local_path=plot_file # The path to the file you just saved
|
| 165 |
-
)
|
| 166 |
-
|
| 167 |
-
# To see the plot uncomment but itll pause the code
|
| 168 |
-
#plt.show()
|
| 169 |
-
print("✅ Checkpoint: Plot with classes distributions is created and saved")
|
| 170 |
-
|
| 171 |
-
|
| 172 |
# --------------- Data Splits ------------
|
| 173 |
def get_transform_pipelines():
|
| 174 |
"""
|
|
@@ -188,8 +127,6 @@ def get_transform_pipelines():
|
|
| 188 |
# Normalise the Tensor; Standardises pixel values
|
| 189 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 190 |
])
|
| 191 |
-
print("✅ Checkpoint: Transform pipeline created")
|
| 192 |
-
|
| 193 |
# Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
|
| 194 |
augmentation_pipeline = transforms.Compose([
|
| 195 |
# Randomly changing some parameters of pictures to enrich dataset
|
|
@@ -200,10 +137,7 @@ def get_transform_pipelines():
|
|
| 200 |
# Convert to Tensor and Normalise
|
| 201 |
transforms.ToTensor(),
|
| 202 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 203 |
-
])
|
| 204 |
-
|
| 205 |
-
print("✅ Checkpoint: Augmentation pipeline created")
|
| 206 |
-
|
| 207 |
# Return both pipelines
|
| 208 |
return normalisation_pipeline, augmentation_pipeline
|
| 209 |
|
|
@@ -232,8 +166,6 @@ def get_prototype_loaders(batch_size=32):
|
|
| 232 |
proto_val_split = split_2_dict['train']
|
| 233 |
proto_test_split = split_2_dict['test']
|
| 234 |
|
| 235 |
-
print("✅ Checkpoint: Prototype Dataset splitted")
|
| 236 |
-
|
| 237 |
# -- Putting splits through pipelines --
|
| 238 |
proto_train_split.set_transform(augmentation_pipeline)
|
| 239 |
proto_val_split.set_transform(normalisation_pipeline)
|
|
@@ -244,7 +176,6 @@ def get_prototype_loaders(batch_size=32):
|
|
| 244 |
proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
|
| 245 |
proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
|
| 246 |
|
| 247 |
-
print("✅ Checkpoint: Prototype DataLoaders are set")
|
| 248 |
return proto_train_loader, proto_val_loader, proto_test_loader
|
| 249 |
|
| 250 |
|
|
@@ -272,8 +203,6 @@ def get_final_loaders(batch_size=32):
|
|
| 272 |
val_split = split_2_dict['train']
|
| 273 |
test_split = split_2_dict['test']
|
| 274 |
|
| 275 |
-
print("✅ Checkpoint: Final Dataset splitted")
|
| 276 |
-
|
| 277 |
# -- Putting splits through pipelines --
|
| 278 |
train_split.set_transform(augmentation_pipeline)
|
| 279 |
val_split.set_transform(normalisation_pipeline)
|
|
@@ -283,22 +212,23 @@ def get_final_loaders(batch_size=32):
|
|
| 283 |
train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
|
| 284 |
val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
|
| 285 |
test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
|
| 286 |
-
|
| 287 |
-
print("✅ Checkpoint: Final DataLoaders are set")
|
| 288 |
return train_loader, val_loader, test_loader
|
| 289 |
|
| 290 |
# ----------------------------------------------------------------------
|
| 291 |
if __name__ == "__main__":
|
| 292 |
|
| 293 |
-
print("\nRunning data_preparation.py")
|
| 294 |
-
|
| 295 |
train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
|
| 296 |
-
|
| 297 |
print("\n--- Handoff Test Successful ---")
|
| 298 |
print(f"Train loader batches: {len(train_loader)}")
|
| 299 |
print(f"Validation loader batches: {len(val_loader)}")
|
| 300 |
print(f"Test loader batches: {len(test_loader)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
# Close the ClearML task
|
| 303 |
task.close()
|
| 304 |
-
print("\n--- Script Finished ---")
|
|
|
|
| 35 |
task.set_random_seed(SEED)
|
| 36 |
clearml_logger = task.get_logger()
|
| 37 |
|
|
|
|
|
|
|
|
|
|
| 38 |
# Loading dataset from Hugging Face and checking it
|
| 39 |
try:
|
| 40 |
ds = load_dataset("DScomp380/plant_village")
|
|
|
|
| 42 |
print(f"Error loading the dataset: {e}")
|
| 43 |
|
| 44 |
data_plants = ds['train']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
data_length = len(data_plants)
|
|
|
|
|
|
|
|
|
|
| 46 |
features = data_plants.features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
# --------------------------- Data selection --------------------------------
|
| 48 |
# Creating the prototyping dataset
|
| 49 |
SUBSET_RATIO = 0.25 # 25% for prototyping
|
|
|
|
| 64 |
|
| 65 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# ---- Exploratory data analysis (EDA) ----
|
| 69 |
|
|
|
|
| 76 |
|
| 77 |
min_count = label_count.min()
|
| 78 |
clearml_logger.report_scalar(
|
| 79 |
+
title="Exploratory data analysis (EDA)",
|
| 80 |
series="Min Class Count",
|
| 81 |
value=min_count,
|
| 82 |
iteration=1
|
|
|
|
| 84 |
|
| 85 |
max_count = label_count.max()
|
| 86 |
clearml_logger.report_scalar(
|
| 87 |
+
title="Exploratory data analysis (EDA)",
|
| 88 |
series="Max Class Count",
|
| 89 |
value=max_count,
|
| 90 |
iteration=1
|
|
|
|
| 92 |
|
| 93 |
mean_count = label_count.mean()
|
| 94 |
clearml_logger.report_scalar(
|
| 95 |
+
title="Exploratory data analysis (EDA)",
|
| 96 |
series="Imbalance Ratio (Max/Min)",
|
| 97 |
value=(max_count / min_count),
|
| 98 |
iteration=1
|
| 99 |
)
|
|
|
|
| 100 |
print("--- Class imbalance analysis --- ")
|
| 101 |
print(f"Max labels in a class: {max_count}")
|
| 102 |
print(f"Min labels in a class: {min_count}")
|
| 103 |
print(f"Mean labels in a class: {mean_count}")
|
| 104 |
print(f"Imbalance ratio: {max_count/min_count:.2f}")
|
|
|
|
| 105 |
|
| 106 |
# Mapping indices to class names
|
| 107 |
class_names = features['label'].names
|
| 108 |
formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
|
| 109 |
label_count.index = formatted_class_names
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
# --------------- Data Splits ------------
|
| 112 |
def get_transform_pipelines():
|
| 113 |
"""
|
|
|
|
| 127 |
# Normalise the Tensor; Standardises pixel values
|
| 128 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 129 |
])
|
|
|
|
|
|
|
| 130 |
# Augmentation pipeline (to change some parameters of the pictures to create "new" ones)
|
| 131 |
augmentation_pipeline = transforms.Compose([
|
| 132 |
# Randomly changing some parameters of pictures to enrich dataset
|
|
|
|
| 137 |
# Convert to Tensor and Normalise
|
| 138 |
transforms.ToTensor(),
|
| 139 |
transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
| 140 |
+
])
|
|
|
|
|
|
|
|
|
|
| 141 |
# Return both pipelines
|
| 142 |
return normalisation_pipeline, augmentation_pipeline
|
| 143 |
|
|
|
|
| 166 |
proto_val_split = split_2_dict['train']
|
| 167 |
proto_test_split = split_2_dict['test']
|
| 168 |
|
|
|
|
|
|
|
| 169 |
# -- Putting splits through pipelines --
|
| 170 |
proto_train_split.set_transform(augmentation_pipeline)
|
| 171 |
proto_val_split.set_transform(normalisation_pipeline)
|
|
|
|
| 176 |
proto_val_loader = DataLoader(dataset = proto_val_split, batch_size = batch_size, shuffle = False )
|
| 177 |
proto_test_loader = DataLoader(dataset = proto_test_split, batch_size = batch_size, shuffle = False )
|
| 178 |
|
|
|
|
| 179 |
return proto_train_loader, proto_val_loader, proto_test_loader
|
| 180 |
|
| 181 |
|
|
|
|
| 203 |
val_split = split_2_dict['train']
|
| 204 |
test_split = split_2_dict['test']
|
| 205 |
|
|
|
|
|
|
|
| 206 |
# -- Putting splits through pipelines --
|
| 207 |
train_split.set_transform(augmentation_pipeline)
|
| 208 |
val_split.set_transform(normalisation_pipeline)
|
|
|
|
| 212 |
train_loader = DataLoader(dataset = train_split, batch_size = batch_size, shuffle = True )
|
| 213 |
val_loader = DataLoader(dataset = val_split, batch_size = batch_size, shuffle = False )
|
| 214 |
test_loader = DataLoader(dataset = test_split, batch_size = batch_size, shuffle = False )
|
|
|
|
|
|
|
| 215 |
return train_loader, val_loader, test_loader
|
| 216 |
|
| 217 |
# ----------------------------------------------------------------------
|
| 218 |
if __name__ == "__main__":
|
| 219 |
|
|
|
|
|
|
|
| 220 |
train_loader, val_loader, test_loader = get_prototype_loaders(batch_size=32)
|
|
|
|
| 221 |
print("\n--- Handoff Test Successful ---")
|
| 222 |
print(f"Train loader batches: {len(train_loader)}")
|
| 223 |
print(f"Validation loader batches: {len(val_loader)}")
|
| 224 |
print(f"Test loader batches: {len(test_loader)}")
|
| 225 |
+
|
| 226 |
+
train_loader_fin, val_loader_fin, test_loader_fin = get_final_loaders(batch_size=32)
|
| 227 |
+
print("\n--- Handoff Test Successful ---")
|
| 228 |
+
print(f"Train loader batches: {len(train_loader_fin)}")
|
| 229 |
+
print(f"Validation loader batches: {len(val_loader_fin)}")
|
| 230 |
+
print(f"Test loader batches: {len(test_loader_fin)}")
|
| 231 |
|
| 232 |
# Close the ClearML task
|
| 233 |
task.close()
|
| 234 |
+
print("\n--- Script Finished ---")
|