Spaces:
Running
Running
Yusuf
committed on
Commit
·
78fbc90
1
Parent(s):
ee1f1d3
configure dataloader workers
Browse files
dataPrep/helpers/clearml_data.py
CHANGED
|
@@ -11,7 +11,7 @@ Takes latest Data Prep ClearML task from project and reconstruct:
|
|
| 11 |
- data loaders for both full and subset datasets
|
| 12 |
- Aug settings used
|
| 13 |
'''
|
| 14 |
-
def extract_latest_data_task(project_name: str = "Small Group Project"):
|
| 15 |
|
| 16 |
# --------- Get latest Data Preparation task from ClearML ---------
|
| 17 |
|
|
@@ -76,7 +76,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
|
|
| 76 |
subset_dataset = full_dataset.select(subset_indices)
|
| 77 |
|
| 78 |
# Get data loaders for both full and subset datasets
|
| 79 |
-
subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset)
|
| 80 |
batch_size = int(data_params['General/dataloaders/batch_size'])
|
| 81 |
seed = int(data_params['General/seed'])
|
| 82 |
|
|
@@ -99,7 +99,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
|
|
| 99 |
Takes a given dataset, subset, data params to create DataLoaders
|
| 100 |
Loaders split data into train, val, test
|
| 101 |
'''
|
| 102 |
-
def get_data_loaders(data_params, subset_dataset, full_dataset):
|
| 103 |
|
| 104 |
# Extract data parameters- these will be used in the DataLoaders
|
| 105 |
seed = int(data_params['General/seed'])
|
|
@@ -115,7 +115,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
|
|
| 115 |
|
| 116 |
# Create DataLoaders using the parameters from data prep
|
| 117 |
subset_loaders = make_dataset_loaders(
|
| 118 |
-
subset_dataset, seed, batch_size, test_size, aug_config
|
| 119 |
)
|
| 120 |
|
| 121 |
print("\n--- Handoff Test Successful ---")
|
|
@@ -125,7 +125,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
|
|
| 125 |
|
| 126 |
|
| 127 |
full_loaders = make_dataset_loaders(
|
| 128 |
-
full_dataset, seed, batch_size, test_size, aug_config
|
| 129 |
)
|
| 130 |
|
| 131 |
print("\n--- Handoff Test Successful ---")
|
|
|
|
| 11 |
- data loaders for both full and subset datasets
|
| 12 |
- Aug settings used
|
| 13 |
'''
|
| 14 |
+
def extract_latest_data_task(project_name: str = "Small Group Project", num_workers: int = 8):
|
| 15 |
|
| 16 |
# --------- Get latest Data Preparation task from ClearML ---------
|
| 17 |
|
|
|
|
| 76 |
subset_dataset = full_dataset.select(subset_indices)
|
| 77 |
|
| 78 |
# Get data loaders for both full and subset datasets
|
| 79 |
+
subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset, num_workers=num_workers)
|
| 80 |
batch_size = int(data_params['General/dataloaders/batch_size'])
|
| 81 |
seed = int(data_params['General/seed'])
|
| 82 |
|
|
|
|
| 99 |
Takes a given dataset, subset, data params to create DataLoaders
|
| 100 |
Loaders split data into train, val, test
|
| 101 |
'''
|
| 102 |
+
def get_data_loaders(data_params, subset_dataset, full_dataset, num_workers):
|
| 103 |
|
| 104 |
# Extract data parameters- these will be used in the DataLoaders
|
| 105 |
seed = int(data_params['General/seed'])
|
|
|
|
| 115 |
|
| 116 |
# Create DataLoaders using the parameters from data prep
|
| 117 |
subset_loaders = make_dataset_loaders(
|
| 118 |
+
subset_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
|
| 119 |
)
|
| 120 |
|
| 121 |
print("\n--- Handoff Test Successful ---")
|
|
|
|
| 125 |
|
| 126 |
|
| 127 |
full_loaders = make_dataset_loaders(
|
| 128 |
+
full_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
|
| 129 |
)
|
| 130 |
|
| 131 |
print("\n--- Handoff Test Successful ---")
|
dataPrep/helpers/transforms_loaders.py
CHANGED
|
@@ -47,24 +47,25 @@ def make_augment_pipeline(aug_config):
|
|
| 47 |
return augmentation
|
| 48 |
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
"""
|
| 51 |
Creates and returns DataLoaders (train, val, test) for a given dataset.
|
| 52 |
Performs a 70/15/15 split
|
| 53 |
"""
|
| 54 |
-
def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
| 55 |
|
| 56 |
# Define transformation pipelines for the dataset
|
| 57 |
normalisation = make_norm_pipeline()
|
| 58 |
augmentation = make_augment_pipeline(aug_config)
|
| 59 |
|
| 60 |
-
def apply_augmentation(batch):
|
| 61 |
-
batch['image'] = [augmentation(x) for x in batch['image']]
|
| 62 |
-
return batch
|
| 63 |
-
|
| 64 |
-
def apply_normalisation(batch):
|
| 65 |
-
batch['image'] = [normalisation(x) for x in batch['image']]
|
| 66 |
-
return batch
|
| 67 |
-
|
| 68 |
# 70/30 split creates train set
|
| 69 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 70 |
train_split = split_1['train']
|
|
@@ -76,14 +77,34 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
|
|
| 76 |
val_split, test_split = split_2['train'], split_2['test']
|
| 77 |
|
| 78 |
# Put each split through pipelines
|
| 79 |
-
train_split.set_transform(apply_augmentation)
|
| 80 |
-
val_split.set_transform(apply_normalisation)
|
| 81 |
-
test_split.set_transform(apply_normalisation)
|
| 82 |
|
| 83 |
# Create dataloader for each
|
| 84 |
-
train_loader = DataLoader(
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
dataset_loaders = {
|
| 89 |
"train": train_loader,
|
|
|
|
| 47 |
return augmentation
|
| 48 |
|
| 49 |
|
| 50 |
+
def apply_augmentation(batch, augmentation):
    """Run every image in *batch* through the augmentation transform.

    Mutates batch['image'] in place (each image is replaced by its
    augmented version) and returns the same batch dict.
    """
    batch['image'] = list(map(augmentation, batch['image']))
    return batch
|
| 53 |
+
|
| 54 |
+
def apply_normalisation(batch, normalisation):
    """Run every image in *batch* through the normalisation transform.

    Mutates batch['image'] in place (each image is replaced by its
    normalised version) and returns the same batch dict.
    """
    normalised = []
    for image in batch['image']:
        normalised.append(normalisation(image))
    batch['image'] = normalised
    return batch
|
| 57 |
+
|
| 58 |
+
|
| 59 |
"""
|
| 60 |
Creates and returns DataLoaders (train, val, test) for a given dataset.
|
| 61 |
Performs a 70/15/15 split
|
| 62 |
"""
|
| 63 |
+
def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config, workers=8):
|
| 64 |
|
| 65 |
# Define transformation pipelines for the dataset
|
| 66 |
normalisation = make_norm_pipeline()
|
| 67 |
augmentation = make_augment_pipeline(aug_config)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# 70/30 split creates train set
|
| 70 |
split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
|
| 71 |
train_split = split_1['train']
|
|
|
|
| 77 |
val_split, test_split = split_2['train'], split_2['test']
|
| 78 |
|
| 79 |
# Put each split through pipelines
|
| 80 |
+
train_split.set_transform(lambda batch: apply_augmentation(batch, augmentation))
|
| 81 |
+
val_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
|
| 82 |
+
test_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
|
| 83 |
|
| 84 |
# Create dataloader for each
|
| 85 |
+
train_loader = DataLoader(
|
| 86 |
+
train_split,
|
| 87 |
+
batch_size=batch_size,
|
| 88 |
+
shuffle=True,
|
| 89 |
+
pin_memory=True,
|
| 90 |
+
num_workers=workers
|
| 91 |
+
)
|
| 92 |
+
val_loader = DataLoader(
|
| 93 |
+
val_split,
|
| 94 |
+
batch_size=batch_size,
|
| 95 |
+
shuffle=False,
|
| 96 |
+
pin_memory=True,
|
| 97 |
+
num_workers=workers
|
| 98 |
+
)
|
| 99 |
+
test_loader = DataLoader(
|
| 100 |
+
test_split,
|
| 101 |
+
batch_size=batch_size,
|
| 102 |
+
shuffle=False,
|
| 103 |
+
pin_memory=True,
|
| 104 |
+
num_workers=workers
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
print(f"\nWorkers used in DataLoaders: {workers}\n")
|
| 108 |
|
| 109 |
dataset_loaders = {
|
| 110 |
"train": train_loader,
|
trainingModel/run_training.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
from clearml import Task
|
| 3 |
from dataPrep.helpers.clearml_data import extract_latest_data_task
|
| 4 |
|
|
@@ -8,8 +8,9 @@ from trainingModel.helpers.Training import train_model
|
|
| 8 |
|
| 9 |
|
| 10 |
# -------------- Load Data --------------
|
|
|
|
| 11 |
project_name = "Small Group Project"
|
| 12 |
-
subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name)
|
| 13 |
|
| 14 |
|
| 15 |
# -------- ClearML Training Task Setup --------
|
|
@@ -30,6 +31,7 @@ training_config = {
|
|
| 30 |
"learning_rate": 1e-3,
|
| 31 |
"optimizer": "adam",
|
| 32 |
"save_path": "best_model.pt",
|
|
|
|
| 33 |
}
|
| 34 |
training_task.connect(training_config)
|
| 35 |
|
|
|
|
| 1 |
+
import os
|
| 2 |
from clearml import Task
|
| 3 |
from dataPrep.helpers.clearml_data import extract_latest_data_task
|
| 4 |
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
# -------------- Load Data --------------
|
| 11 |
+
NUM_WORKERS = 0
|
| 12 |
project_name = "Small Group Project"
|
| 13 |
+
subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name, num_workers=NUM_WORKERS)
|
| 14 |
|
| 15 |
|
| 16 |
# -------- ClearML Training Task Setup --------
|
|
|
|
| 31 |
"learning_rate": 1e-3,
|
| 32 |
"optimizer": "adam",
|
| 33 |
"save_path": "best_model.pt",
|
| 34 |
+
"num_workers": NUM_WORKERS
|
| 35 |
}
|
| 36 |
training_task.connect(training_config)
|
| 37 |
|