Yusuf committed on
Commit
0abee12
·
1 Parent(s): 4452b74

chore: extract load data prep from training

Browse files
dataPrep/helpers/clearml_data.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+
4
+ from clearml import Task, Dataset
5
+ from datasets import load_dataset
6
+ from dataPrep.helpers.transforms_loaders import make_dataset_loaders
7
+
8
+
9
def extract_latest_data_task(project_name: str = "Small Group Project"):
    """Locate the most recent 'Data Preparation' ClearML task in *project_name*
    and rebuild its data loaders and handoff metadata.

    Parameters:
        project_name: ClearML project to search for 'Data Preparation' tasks.

    Returns:
        tuple: ``(subset_loaders, full_loaders, data_prep_metadata)`` where the
        loaders are the dicts produced by ``get_data_loaders`` and the metadata
        dict records the data-prep task id, dataset id/link, augmentation
        config, batch size and seed that were used.

    Raises:
        RuntimeError: if the project has no tasks, has no 'Data Preparation'
            task, or the source dataset cannot be loaded.
    """
    all_tasks = Task.get_tasks(project_name=project_name)
    if not all_tasks:
        raise RuntimeError(f"No tasks found in project '{project_name}'")

    dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
    if not dp_tasks:
        raise RuntimeError("No 'Data Preparation' tasks found in this project!")

    # ClearML task ids are random hex strings, so max-by-id is arbitrary;
    # select the newest task by its creation timestamp instead.
    # NOTE(review): assumes Task.data.created is populated by get_tasks — confirm.
    latest_task = max(dp_tasks, key=lambda t: t.data.created)
    DYNAMIC_TASK_ID = latest_task.id
    DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)

    # Dataset ID is stored as a `key = "value"` line in the config object;
    # split only on the first '=' so values containing '=' stay intact.
    config_objects = DATA_PREP.get_configuration_objects()
    raw_meta = config_objects["Dataset Metadata"]
    dataset_id = raw_meta.split("=", 1)[1].strip().replace('"', "")

    # Download the ClearML dataset artifact holding the subset indices.
    subset_clearml = Dataset.get(dataset_id=dataset_id)
    local_folder = subset_clearml.get_local_copy()

    subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))

    # Hyperparameters recorded by the data-prep task.
    data_params = DATA_PREP.get_parameters()
    dataset_link = data_params['General/dataset/link']

    # Load the full dataset from its original source (e.g. the HF Hub).
    try:
        ds = load_dataset(dataset_link)
    except Exception as e:
        raise RuntimeError(f"Error loading the dataset: {e}") from e

    full_dataset = ds['train']

    # Apply subset indices to full dataset - this gives you the same subset as data prep
    subset_dataset = full_dataset.select(subset_indices)

    # Get data loaders for both full and subset datasets
    subset_loaders, full_loaders, aug_config = get_data_loaders(data_params, subset_dataset, full_dataset)
    batch_size = int(data_params['General/dataloaders/batch_size'])
    seed = int(data_params['General/seed'])

    # Gather data prep task metadata for logging on the consumer's task.
    data_prep_metadata = {
        "data_prep_task_id": DYNAMIC_TASK_ID,
        "dataset_id": dataset_id,
        "dataset_link": dataset_link,
        "augmentation_used": aug_config,
        "batch_size_used": batch_size,
        "seed_used": seed,
    }

    return subset_loaders, full_loaders, data_prep_metadata
70
+
71
+
72
def _print_loader_summary(loaders, label_prefix: str = "") -> None:
    # Both the subset and full loaders get the identical handoff report;
    # only the label prefix ("Prototype " vs "") differs.
    print("\n--- Handoff Test Successful ---")
    print(f"{label_prefix}Train loader batches: {len(loaders['train'])}")
    print(f"{label_prefix}Validation loader batches: {len(loaders['val'])}")
    print(f"{label_prefix}Test loader batches: {len(loaders['test'])}")


def get_data_loaders(data_params, subset_dataset, full_dataset):
    """Build train/val/test DataLoaders for a dataset and its subset.

    Parameters:
        data_params: flat dict of ClearML task parameters
            ('General/...'-prefixed string values).
        subset_dataset: the prototype subset of the data.
        full_dataset: the complete dataset.

    Returns:
        tuple: ``(subset_loaders, full_loaders, aug_config)`` where each
        loaders value is a dict keyed 'train'/'val'/'test' and ``aug_config``
        is the augmentation settings dict used for both.
    """
    # Extract data parameters - these will be used in the DataLoaders.
    seed = int(data_params['General/seed'])
    batch_size = int(data_params['General/dataloaders/batch_size'])
    test_size = float(data_params['General/dataloaders/test_size'])

    aug_config = {
        'rotation': float(data_params['General/augmentation/rotation']),
        'brightness': float(data_params['General/augmentation/brightness']),
        'saturation': float(data_params['General/augmentation/saturation']),
        'blur': float(data_params['General/augmentation/blur'])
    }

    # Create DataLoaders using the parameters from data prep.
    subset_loaders = make_dataset_loaders(
        subset_dataset, seed, batch_size, test_size, aug_config
    )
    _print_loader_summary(subset_loaders, label_prefix="Prototype ")

    full_loaders = make_dataset_loaders(
        full_dataset, seed, batch_size, test_size, aug_config
    )
    _print_loader_summary(full_loaders)

    return subset_loaders, full_loaders, aug_config
trainingModel/run_training.py CHANGED
@@ -1,9 +1,6 @@
1
- import os
2
- import numpy as np
3
 
4
- from clearml import Task, Dataset
5
- from datasets import load_dataset
6
- from dataPrep.helpers.transforms_loaders import make_dataset_loaders
7
 
8
  import torch
9
  from models.modelOne import modelOne
@@ -11,79 +8,8 @@ from trainingModel.Training import train_model
11
 
12
 
13
  # -------------- Load Data --------------
14
-
15
- all_tasks = Task.get_tasks(project_name="Small Group Project")
16
- if not all_tasks:
17
- raise RuntimeError("No tasks found in project 'Small Group Project'")
18
-
19
- dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
20
- if not dp_tasks:
21
- raise RuntimeError("No 'Data Preparation' tasks found in this project!")
22
-
23
- # Latest Data Prep Task
24
- latest_task = max(dp_tasks, key=lambda t: t.id)
25
- DYNAMIC_TASK_ID = latest_task.id
26
- DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
27
-
28
- # Dataset ID
29
- config_objects = DATA_PREP.get_configuration_objects()
30
- raw_meta = config_objects["Dataset Metadata"]
31
- dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
32
-
33
- # Load ClearML Dataset
34
- subset_clearml = Dataset.get(dataset_id=dataset_id)
35
- local_folder = subset_clearml.get_local_copy()
36
-
37
- subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
38
-
39
- # Load Dataset Parameters
40
- data_params = DATA_PREP.get_parameters()
41
- dataset_link = data_params['General/dataset/link']
42
-
43
- # Load Full Dataset
44
- try:
45
- ds = load_dataset(dataset_link)
46
- except Exception as e:
47
- raise RuntimeError(f"Error loading the dataset: {e}")
48
-
49
- full_dataset = ds['train']
50
-
51
- # Apply subset indices to full dataset - this gives you the same subset as data prep
52
- subset_dataset = full_dataset.select(subset_indices)
53
-
54
-
55
- # Extract parameters from data prep task - these will create the DataLoaders
56
- seed = int(data_params['General/seed'])
57
- batch_size = int(data_params['General/dataloaders/batch_size'])
58
- test_size = float(data_params['General/dataloaders/test_size'])
59
-
60
- aug_config = {
61
- 'rotation': float(data_params['General/augmentation/rotation']),
62
- 'brightness': float(data_params['General/augmentation/brightness']),
63
- 'saturation': float(data_params['General/augmentation/saturation']),
64
- 'blur': float(data_params['General/augmentation/blur'])
65
- }
66
-
67
- # Create DataLoaders using the parameters from data prep
68
- subset_loaders = make_dataset_loaders(
69
- subset_dataset, seed, batch_size, test_size, aug_config
70
- )
71
-
72
- print("\n--- Handoff Test Successful ---")
73
- print(f"Prototype Train loader batches: {len(subset_loaders['train'])}")
74
- print(f"Prototype Validation loader batches: {len(subset_loaders['val'])}")
75
- print(f"Prototype Test loader batches: {len(subset_loaders['test'])}")
76
-
77
-
78
- full_loaders = make_dataset_loaders(
79
- full_dataset, seed, batch_size, test_size, aug_config
80
- )
81
-
82
- print("\n--- Handoff Test Successful ---")
83
- print(f"Train loader batches: {len(full_loaders['train'])}")
84
- print(f"Validation loader batches: {len(full_loaders['val'])}")
85
- print(f"Test loader batches: {len(full_loaders['test'])}")
86
- # -------------- DATA PREP ENDS --------------
87
 
88
 
89
  # -------- ClearML Training Task Setup --------
@@ -95,21 +21,13 @@ training_task = Task.init(
95
 
96
  # Detail the data prep task used
97
  training_logger = training_task.get_logger()
98
- data_prep_metadata = {
99
- "data_prep_task_id": DYNAMIC_TASK_ID,
100
- "dataset_id": dataset_id,
101
- "dataset_link": dataset_link,
102
- "augmentation_used": aug_config,
103
- "seed_used": seed,
104
- }
105
- training_task.connect(data_prep_metadata, name="data_prep_metadata")
106
 
107
  # Training parameters - Modify these to experiment
108
  training_config = {
109
  "num_classes": 39,
110
  "n_epochs": 3,
111
  "learning_rate": 1e-3,
112
- "batch_size": batch_size,
113
  "optimizer": "adam",
114
  "save_path": "best_model.pt",
115
  }
 
 
 
1
 
2
+ from clearml import Task
3
+ from dataPrep.helpers.clearml_data import extract_latest_data_task
 
4
 
5
  import torch
6
  from models.modelOne import modelOne
 
8
 
9
 
10
  # -------------- Load Data --------------
11
+ project_name = "Small Group Project"
12
+ subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  # -------- ClearML Training Task Setup --------
 
21
 
22
  # Detail the data prep task used
23
  training_logger = training_task.get_logger()
24
+ training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
 
 
 
 
 
 
 
25
 
26
  # Training parameters - Modify these to experiment
27
  training_config = {
28
  "num_classes": 39,
29
  "n_epochs": 3,
30
  "learning_rate": 1e-3,
 
31
  "optimizer": "adam",
32
  "save_path": "best_model.pt",
33
  }