Spaces:
Running
Running
ra1425
committed on
Commit
·
83d4d7f
1
Parent(s):
18d7ed3
Fix: Conflict of formats between Data prep and Training, fix Tensor size
Browse files
dataPrep/helpers/create_dataset.py
CHANGED
|
@@ -35,13 +35,14 @@ def make_subset(dataset_link, subset_ratio, clearml_logger):
|
|
| 35 |
subset_indices = indices[:subset_size]
|
| 36 |
|
| 37 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
clearml_dataset = Dataset.create(
|
| 41 |
dataset_name="Plant Village Prototype",
|
| 42 |
dataset_project="Small Group Project",
|
| 43 |
dataset_tags=["prototype", "subset"],
|
| 44 |
-
use_current_task=
|
| 45 |
)
|
| 46 |
clearml_dataset.add_tags([
|
| 47 |
f"subset_ratio_{subset_ratio}",
|
|
|
|
| 35 |
subset_indices = indices[:subset_size]
|
| 36 |
|
| 37 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 38 |
+
# I THINK WE NEED TO REMOVE THIS LATER
|
| 39 |
+
# We dont really need to upload subset everytime (Im not sure tho)
|
| 40 |
+
# Register subset in ClearML
|
| 41 |
clearml_dataset = Dataset.create(
|
| 42 |
dataset_name="Plant Village Prototype",
|
| 43 |
dataset_project="Small Group Project",
|
| 44 |
dataset_tags=["prototype", "subset"],
|
| 45 |
+
use_current_task=False
|
| 46 |
)
|
| 47 |
clearml_dataset.add_tags([
|
| 48 |
f"subset_ratio_{subset_ratio}",
|
dataPrep/helpers/transforms_loaders.py
CHANGED
|
@@ -9,13 +9,13 @@ from torch.utils.data import DataLoader
|
|
| 9 |
# Standard ImageNet mean and std - Used to normalize the tensors
|
| 10 |
IMAGENET_MEAN = [0.485, 0.456, 0.406]
|
| 11 |
IMAGENET_STD = [0.229, 0.224, 0.225]
|
| 12 |
-
|
| 13 |
# Defines and returns the normalization pipeline.
|
| 14 |
def make_norm_pipeline():
|
| 15 |
|
| 16 |
# Pipeline ensures image format is consistent (for Val/Test)
|
| 17 |
normalisation = transforms.Compose([
|
| 18 |
-
|
| 19 |
# Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
|
| 20 |
transforms.ToTensor(),
|
| 21 |
|
|
@@ -35,7 +35,7 @@ def make_augment_pipeline(aug_config):
|
|
| 35 |
|
| 36 |
# Augmentation pipeline (to create "new" images by changing some parameters)
|
| 37 |
augmentation = transforms.Compose([
|
| 38 |
-
|
| 39 |
# Randomly changing some parameters of pictures to enrich dataset
|
| 40 |
transforms.RandomRotation(rotation),
|
| 41 |
transforms.ColorJitter(brightness=brightness, saturation=saturation),
|
|
@@ -57,6 +57,14 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
|
| 57 |
normalisation = make_norm_pipeline()
|
| 58 |
augmentation = make_augment_pipeline(aug_config)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# 70/30 split creates train set
|
| 61 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 62 |
train_split = split_1['train']
|
|
@@ -68,9 +76,9 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
|
| 68 |
val_split, test_split = split_2['train'], split_2['test']
|
| 69 |
|
| 70 |
# Put each split through pipelines
|
| 71 |
-
train_split.set_transform(
|
| 72 |
-
val_split.set_transform(
|
| 73 |
-
test_split.set_transform(
|
| 74 |
|
| 75 |
# Create dataloader for each
|
| 76 |
train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
|
|
|
|
| 9 |
# Standard ImageNet mean and std - Used to normalize the tensors
|
| 10 |
IMAGENET_MEAN = [0.485, 0.456, 0.406]
|
| 11 |
IMAGENET_STD = [0.229, 0.224, 0.225]
|
| 12 |
+
IMAGE_SIZE = (256, 256)
|
| 13 |
# Defines and returns the normalization pipeline.
|
| 14 |
def make_norm_pipeline():
|
| 15 |
|
| 16 |
# Pipeline ensures image format is consistent (for Val/Test)
|
| 17 |
normalisation = transforms.Compose([
|
| 18 |
+
transforms.Resize(IMAGE_SIZE),
|
| 19 |
# Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
|
| 20 |
transforms.ToTensor(),
|
| 21 |
|
|
|
|
| 35 |
|
| 36 |
# Augmentation pipeline (to create "new" images by changing some parameters)
|
| 37 |
augmentation = transforms.Compose([
|
| 38 |
+
transforms.Resize(IMAGE_SIZE),
|
| 39 |
# Randomly changing some parameters of pictures to enrich dataset
|
| 40 |
transforms.RandomRotation(rotation),
|
| 41 |
transforms.ColorJitter(brightness=brightness, saturation=saturation),
|
|
|
|
| 57 |
normalisation = make_norm_pipeline()
|
| 58 |
augmentation = make_augment_pipeline(aug_config)
|
| 59 |
|
| 60 |
+
def apply_augmentation(batch):
|
| 61 |
+
batch['image'] = [augmentation(x) for x in batch['image']]
|
| 62 |
+
return batch
|
| 63 |
+
|
| 64 |
+
def apply_normalisation(batch):
|
| 65 |
+
batch['image'] = [normalisation(x) for x in batch['image']]
|
| 66 |
+
return batch
|
| 67 |
+
|
| 68 |
# 70/30 split creates train set
|
| 69 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 70 |
train_split = split_1['train']
|
|
|
|
| 76 |
val_split, test_split = split_2['train'], split_2['test']
|
| 77 |
|
| 78 |
# Put each split through pipelines
|
| 79 |
+
train_split.set_transform(apply_augmentation)
|
| 80 |
+
val_split.set_transform(apply_normalisation)
|
| 81 |
+
test_split.set_transform(apply_normalisation)
|
| 82 |
|
| 83 |
# Create dataloader for each
|
| 84 |
train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
|