Yusuf committed on
Commit
78fbc90
·
1 Parent(s): ee1f1d3

configure dataloader workers

Browse files
dataPrep/helpers/clearml_data.py CHANGED
@@ -11,7 +11,7 @@ Takes latest Data Prep ClearML task from project and reconstruct:
11
  - data loaders for both full and subset datasets
12
  - Aug settings used
13
  '''
14
- def extract_latest_data_task(project_name: str = "Small Group Project"):
15
 
16
  # --------- Get latest Data Preparation task from ClearML ---------
17
 
@@ -76,7 +76,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
76
  subset_dataset = full_dataset.select(subset_indices)
77
 
78
  # Get data loaders for both full and subset datasets
79
- subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset)
80
  batch_size = int(data_params['General/dataloaders/batch_size'])
81
  seed = int(data_params['General/seed'])
82
 
@@ -99,7 +99,7 @@ def extract_latest_data_task(project_name: str = "Small Group Project"):
99
  Takes a given dataset, subset, data params to create DataLoaders
100
  Loaders split data into train, val, test
101
  '''
102
- def get_data_loaders(data_params, subset_dataset, full_dataset):
103
 
104
  # Extract data parameters- these will be used in the DataLoaders
105
  seed = int(data_params['General/seed'])
@@ -115,7 +115,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
115
 
116
  # Create DataLoaders using the parameters from data prep
117
  subset_loaders = make_dataset_loaders(
118
- subset_dataset, seed, batch_size, test_size, aug_config
119
  )
120
 
121
  print("\n--- Handoff Test Successful ---")
@@ -125,7 +125,7 @@ def get_data_loaders(data_params, subset_dataset, full_dataset):
125
 
126
 
127
  full_loaders = make_dataset_loaders(
128
- full_dataset, seed, batch_size, test_size, aug_config
129
  )
130
 
131
  print("\n--- Handoff Test Successful ---")
 
11
  - data loaders for both full and subset datasets
12
  - Aug settings used
13
  '''
14
+ def extract_latest_data_task(project_name: str = "Small Group Project", num_workers: int = 8):
15
 
16
  # --------- Get latest Data Preparation task from ClearML ---------
17
 
 
76
  subset_dataset = full_dataset.select(subset_indices)
77
 
78
  # Get data loaders for both full and subset datasets
79
+ subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset, num_workers=num_workers)
80
  batch_size = int(data_params['General/dataloaders/batch_size'])
81
  seed = int(data_params['General/seed'])
82
 
 
99
  Takes a given dataset, subset, data params to create DataLoaders
100
  Loaders split data into train, val, test
101
  '''
102
+ def get_data_loaders(data_params, subset_dataset, full_dataset, num_workers):
103
 
104
  # Extract data parameters- these will be used in the DataLoaders
105
  seed = int(data_params['General/seed'])
 
115
 
116
  # Create DataLoaders using the parameters from data prep
117
  subset_loaders = make_dataset_loaders(
118
+ subset_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
119
  )
120
 
121
  print("\n--- Handoff Test Successful ---")
 
125
 
126
 
127
  full_loaders = make_dataset_loaders(
128
+ full_dataset, seed, batch_size, test_size, aug_config, workers=num_workers
129
  )
130
 
131
  print("\n--- Handoff Test Successful ---")
dataPrep/helpers/transforms_loaders.py CHANGED
@@ -47,24 +47,25 @@ def make_augment_pipeline(aug_config):
47
  return augmentation
48
 
49
 
 
 
 
 
 
 
 
 
 
50
  """
51
  Creates and returns DataLoaders (train, val, test) for a given dataset.
52
  Performs a 70/15/15 split
53
  """
54
- def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
55
 
56
  # Define transformation pipelines for the dataset
57
  normalisation = make_norm_pipeline()
58
  augmentation = make_augment_pipeline(aug_config)
59
 
60
- def apply_augmentation(batch):
61
- batch['image'] = [augmentation(x) for x in batch['image']]
62
- return batch
63
-
64
- def apply_normalisation(batch):
65
- batch['image'] = [normalisation(x) for x in batch['image']]
66
- return batch
67
-
68
  # 70/30 split creates train set
69
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
70
  train_split = split_1['train']
@@ -76,14 +77,34 @@ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config):
76
  val_split, test_split = split_2['train'], split_2['test']
77
 
78
  # Put each split through pipelines
79
- train_split.set_transform(apply_augmentation)
80
- val_split.set_transform(apply_normalisation)
81
- test_split.set_transform(apply_normalisation)
82
 
83
  # Create dataloader for each
84
- train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
85
- val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
86
- test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  dataset_loaders = {
89
  "train": train_loader,
 
47
  return augmentation
48
 
49
 
50
+ def apply_augmentation(batch, augmentation):
51
+ batch['image'] = [augmentation(x) for x in batch['image']]
52
+ return batch
53
+
54
+ def apply_normalisation(batch, normalisation):
55
+ batch['image'] = [normalisation(x) for x in batch['image']]
56
+ return batch
57
+
58
+
59
  """
60
  Creates and returns DataLoaders (train, val, test) for a given dataset.
61
  Performs a 70/15/15 split
62
  """
63
+ def make_dataset_loaders(dataset, seed, batch_size, test_size, aug_config, workers=8):
64
 
65
  # Define transformation pipelines for the dataset
66
  normalisation = make_norm_pipeline()
67
  augmentation = make_augment_pipeline(aug_config)
68
 
 
 
 
 
 
 
 
 
69
  # 70/30 split creates train set
70
  split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
71
  train_split = split_1['train']
 
77
  val_split, test_split = split_2['train'], split_2['test']
78
 
79
  # Put each split through pipelines
80
+ train_split.set_transform(lambda batch: apply_augmentation(batch, augmentation))
81
+ val_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
82
+ test_split.set_transform(lambda batch: apply_normalisation(batch, normalisation))
83
 
84
  # Create dataloader for each
85
+ train_loader = DataLoader(
86
+ train_split,
87
+ batch_size=batch_size,
88
+ shuffle=True,
89
+ pin_memory=True,
90
+ num_workers=workers
91
+ )
92
+ val_loader = DataLoader(
93
+ val_split,
94
+ batch_size=batch_size,
95
+ shuffle=False,
96
+ pin_memory=True,
97
+ num_workers=workers
98
+ )
99
+ test_loader = DataLoader(
100
+ test_split,
101
+ batch_size=batch_size,
102
+ shuffle=False,
103
+ pin_memory=True,
104
+ num_workers=workers
105
+ )
106
+
107
+ print(f"\nWorkers used in DataLoaders: {workers}\n")
108
 
109
  dataset_loaders = {
110
  "train": train_loader,
trainingModel/run_training.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
  from clearml import Task
3
  from dataPrep.helpers.clearml_data import extract_latest_data_task
4
 
@@ -8,8 +8,9 @@ from trainingModel.helpers.Training import train_model
8
 
9
 
10
  # -------------- Load Data --------------
 
11
  project_name = "Small Group Project"
12
- subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name)
13
 
14
 
15
  # -------- ClearML Training Task Setup --------
@@ -30,6 +31,7 @@ training_config = {
30
  "learning_rate": 1e-3,
31
  "optimizer": "adam",
32
  "save_path": "best_model.pt",
 
33
  }
34
  training_task.connect(training_config)
35
 
 
1
+ import os
2
  from clearml import Task
3
  from dataPrep.helpers.clearml_data import extract_latest_data_task
4
 
 
8
 
9
 
10
  # -------------- Load Data --------------
11
+ NUM_WORKERS = 0
12
  project_name = "Small Group Project"
13
+ subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name, num_workers=NUM_WORKERS)
14
 
15
 
16
  # -------- ClearML Training Task Setup --------
 
31
  "learning_rate": 1e-3,
32
  "optimizer": "adam",
33
  "save_path": "best_model.pt",
34
+ "num_workers": NUM_WORKERS
35
  }
36
  training_task.connect(training_config)
37