Spaces:
Running
Running
ra1425
committed on
Commit
·
83d4d7f
1
Parent(s):
18d7ed3
Fix: Conflict of formats between Data prep and Training, fix Tensor size
Browse files
dataPrep/helpers/create_dataset.py
CHANGED
|
@@ -35,13 +35,14 @@ def make_subset(dataset_link, subset_ratio, clearml_logger):
|
|
| 35 |
subset_indices = indices[:subset_size]
|
| 36 |
|
| 37 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
clearml_dataset = Dataset.create(
|
| 41 |
dataset_name="Plant Village Prototype",
|
| 42 |
dataset_project="Small Group Project",
|
| 43 |
dataset_tags=["prototype", "subset"],
|
| 44 |
-
use_current_task=
|
| 45 |
)
|
| 46 |
clearml_dataset.add_tags([
|
| 47 |
f"subset_ratio_{subset_ratio}",
|
|
|
|
| 35 |
subset_indices = indices[:subset_size]
|
| 36 |
|
| 37 |
prototyping_dataset = data_plants.select(subset_indices)
|
| 38 |
+
# I THINK WE NEED TO REMOVE THIS LATER
|
| 39 |
+
# We dont really need to upload subset everytime (Im not sure tho)
|
| 40 |
+
# Register subset in ClearML
|
| 41 |
clearml_dataset = Dataset.create(
|
| 42 |
dataset_name="Plant Village Prototype",
|
| 43 |
dataset_project="Small Group Project",
|
| 44 |
dataset_tags=["prototype", "subset"],
|
| 45 |
+
use_current_task=False
|
| 46 |
)
|
| 47 |
clearml_dataset.add_tags([
|
| 48 |
f"subset_ratio_{subset_ratio}",
|
dataPrep/helpers/transforms_loaders.py
CHANGED
|
@@ -9,13 +9,13 @@ from torch.utils.data import DataLoader
|
|
| 9 |
# Standard ImageNet mean and std - Used to normalize the tensors
|
| 10 |
IMAGENET_MEAN = [0.485, 0.456, 0.406]
|
| 11 |
IMAGENET_STD = [0.229, 0.224, 0.225]
|
| 12 |
-
|
| 13 |
# Defines and returns the normalization pipeline.
|
| 14 |
def make_norm_pipeline():
|
| 15 |
|
| 16 |
# Pipeline ensures image format is consistent (for Val/Test)
|
| 17 |
normalisation = transforms.Compose([
|
| 18 |
-
|
| 19 |
# Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
|
| 20 |
transforms.ToTensor(),
|
| 21 |
|
|
@@ -35,7 +35,7 @@ def make_augment_pipeline(aug_config):
|
|
| 35 |
|
| 36 |
# Augmentation pipeline (to create "new" images by changing some parameters)
|
| 37 |
augmentation = transforms.Compose([
|
| 38 |
-
|
| 39 |
# Randomly changing some parameters of pictures to enrich dataset
|
| 40 |
transforms.RandomRotation(rotation),
|
| 41 |
transforms.ColorJitter(brightness=brightness, saturation=saturation),
|
|
@@ -57,6 +57,14 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
|
| 57 |
normalisation = make_norm_pipeline()
|
| 58 |
augmentation = make_augment_pipeline(aug_config)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# 70/30 split creates train set
|
| 61 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 62 |
train_split = split_1['train']
|
|
@@ -68,9 +76,9 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
|
| 68 |
val_split, test_split = split_2['train'], split_2['test']
|
| 69 |
|
| 70 |
# Put each split through pipelines
|
| 71 |
-
train_split.set_transform(
|
| 72 |
-
val_split.set_transform(
|
| 73 |
-
test_split.set_transform(
|
| 74 |
|
| 75 |
# Create dataloader for each
|
| 76 |
train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
|
|
|
|
| 9 |
# Standard ImageNet mean and std - Used to normalize the tensors
|
| 10 |
IMAGENET_MEAN = [0.485, 0.456, 0.406]
|
| 11 |
IMAGENET_STD = [0.229, 0.224, 0.225]
|
| 12 |
+
IMAGE_SIZE = (256, 256)
|
| 13 |
# Defines and returns the normalization pipeline.
|
| 14 |
def make_norm_pipeline():
|
| 15 |
|
| 16 |
# Pipeline ensures image format is consistent (for Val/Test)
|
| 17 |
normalisation = transforms.Compose([
|
| 18 |
+
transforms.Resize(IMAGE_SIZE),
|
| 19 |
# Convert PIL Image to a PyTorch Tensor, scales pixel values from [0, 255] to [0.0, 1.0]
|
| 20 |
transforms.ToTensor(),
|
| 21 |
|
|
|
|
| 35 |
|
| 36 |
# Augmentation pipeline (to create "new" images by changing some parameters)
|
| 37 |
augmentation = transforms.Compose([
|
| 38 |
+
transforms.Resize(IMAGE_SIZE),
|
| 39 |
# Randomly changing some parameters of pictures to enrich dataset
|
| 40 |
transforms.RandomRotation(rotation),
|
| 41 |
transforms.ColorJitter(brightness=brightness, saturation=saturation),
|
|
|
|
| 57 |
normalisation = make_norm_pipeline()
|
| 58 |
augmentation = make_augment_pipeline(aug_config)
|
| 59 |
|
| 60 |
+
def apply_augmentation(batch):
|
| 61 |
+
batch['image'] = [augmentation(x) for x in batch['image']]
|
| 62 |
+
return batch
|
| 63 |
+
|
| 64 |
+
def apply_normalisation(batch):
|
| 65 |
+
batch['image'] = [normalisation(x) for x in batch['image']]
|
| 66 |
+
return batch
|
| 67 |
+
|
| 68 |
# 70/30 split creates train set
|
| 69 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 70 |
train_split = split_1['train']
|
|
|
|
| 76 |
val_split, test_split = split_2['train'], split_2['test']
|
| 77 |
|
| 78 |
# Put each split through pipelines
|
| 79 |
+
train_split.set_transform(apply_augmentation)
|
| 80 |
+
val_split.set_transform(apply_normalisation)
|
| 81 |
+
test_split.set_transform(apply_normalisation)
|
| 82 |
|
| 83 |
# Create dataloader for each
|
| 84 |
train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
|