Spaces:
Running
Running
Yusuf
committed on
Commit
·
c638d1e
1
Parent(s):
7b10a4d
feat: clearml training metrics
Browse files- trainingModel/Training.py +8 -1
- trainingModel/run_training.py +53 -11
trainingModel/Training.py
CHANGED
|
@@ -139,5 +139,12 @@ def train_model(
|
|
| 139 |
print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
|
| 140 |
print(f"Best model weights saved to: {save_path}")
|
| 141 |
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
|
|
|
| 139 |
print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
|
| 140 |
print(f"Best model weights saved to: {save_path}")
|
| 141 |
|
| 142 |
+
training_metrics = {
|
| 143 |
+
"losses": training_losses,
|
| 144 |
+
"accuracies": training_accuracies,
|
| 145 |
+
"val_accuracies": val_accuracies,
|
| 146 |
+
"best_accuracy": best_accuracy,
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
return training_metrics
|
| 150 |
|
trainingModel/run_training.py
CHANGED
|
@@ -10,7 +10,8 @@ from models.modelOne import modelOne
|
|
| 10 |
from trainingModel.Training import train_model
|
| 11 |
|
| 12 |
|
| 13 |
-
#
|
|
|
|
| 14 |
all_tasks = Task.get_tasks(project_name="Small Group Project")
|
| 15 |
if not all_tasks:
|
| 16 |
raise RuntimeError("No tasks found in project 'Small Group Project'")
|
|
@@ -19,6 +20,7 @@ dp_tasks = [t for t in all_tasks if t.name == "Data Preparation"]
|
|
| 19 |
if not dp_tasks:
|
| 20 |
raise RuntimeError("No 'Data Preparation' tasks found in this project!")
|
| 21 |
|
|
|
|
| 22 |
latest_task = max(dp_tasks, key=lambda t: t.id)
|
| 23 |
DYNAMIC_TASK_ID = latest_task.id
|
| 24 |
DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
|
|
@@ -32,8 +34,7 @@ dataset_id = raw_meta.split("=")[1].strip().replace('"', "")
|
|
| 32 |
subset_clearml = Dataset.get(dataset_id=dataset_id)
|
| 33 |
local_folder = subset_clearml.get_local_copy()
|
| 34 |
|
| 35 |
-
|
| 36 |
-
subset_indices = np.load(subset_indices_path)
|
| 37 |
|
| 38 |
# Load Dataset Parameters
|
| 39 |
data_params = DATA_PREP.get_parameters()
|
|
@@ -84,24 +85,65 @@ print("\n--- Handoff Test Successful ---")
|
|
| 84 |
print(f"Train loader batches: {len(full_loaders['train'])}")
|
| 85 |
print(f"Validation loader batches: {len(full_loaders['val'])}")
|
| 86 |
print(f"Test loader batches: {len(full_loaders['test'])}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
# -------- Build the ML model --------
|
| 90 |
-
model = modelOne(noOfClasses=
|
| 91 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 92 |
|
| 93 |
|
| 94 |
# ------- Train the model (on subset for now) -------
|
| 95 |
|
| 96 |
-
#When calling this function, the model should be trained on the given dataset
|
| 97 |
-
|
| 98 |
print("\n--- Starting Model Training on Subset ---")
|
| 99 |
-
train_model(
|
| 100 |
model=model,
|
| 101 |
train_loader=subset_loaders['train'],
|
| 102 |
val_loader=subset_loaders['val'],
|
| 103 |
device=device,
|
| 104 |
-
n_epochs=
|
| 105 |
-
lr=
|
| 106 |
-
save_path="
|
| 107 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from trainingModel.Training import train_model
|
| 11 |
|
| 12 |
|
| 13 |
+
# -------------- Load Data --------------
|
| 14 |
+
|
| 15 |
all_tasks = Task.get_tasks(project_name="Small Group Project")
|
| 16 |
if not all_tasks:
|
| 17 |
raise RuntimeError("No tasks found in project 'Small Group Project'")
|
|
|
|
| 20 |
if not dp_tasks:
|
| 21 |
raise RuntimeError("No 'Data Preparation' tasks found in this project!")
|
| 22 |
|
| 23 |
+
# Latest Data Prep Task
|
| 24 |
latest_task = max(dp_tasks, key=lambda t: t.id)
|
| 25 |
DYNAMIC_TASK_ID = latest_task.id
|
| 26 |
DATA_PREP = Task.get_task(task_id=DYNAMIC_TASK_ID)
|
|
|
|
| 34 |
subset_clearml = Dataset.get(dataset_id=dataset_id)
|
| 35 |
local_folder = subset_clearml.get_local_copy()
|
| 36 |
|
| 37 |
+
subset_indices = np.load(os.path.join(local_folder, "subset_indices.npy"))
|
|
|
|
| 38 |
|
| 39 |
# Load Dataset Parameters
|
| 40 |
data_params = DATA_PREP.get_parameters()
|
|
|
|
| 85 |
print(f"Train loader batches: {len(full_loaders['train'])}")
|
| 86 |
print(f"Validation loader batches: {len(full_loaders['val'])}")
|
| 87 |
print(f"Test loader batches: {len(full_loaders['test'])}")
|
| 88 |
+
# -------------- DATA PREP ENDS --------------
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# -------- ClearML Training Task Setup --------
|
| 92 |
+
training_task = Task.init(
|
| 93 |
+
project_name="Small Group Project",
|
| 94 |
+
task_name="Model Training",
|
| 95 |
+
reuse_last_task_id=False,
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
training_logger = training_task.get_logger()
|
| 99 |
+
training_task.connect({"data_prep_task_used": DYNAMIC_TASK_ID})
|
| 100 |
+
|
| 101 |
+
# Training parameters - Modify these to experiment
|
| 102 |
+
training_config = {
|
| 103 |
+
"num_classes": 39,
|
| 104 |
+
"n_epochs": 1,
|
| 105 |
+
"learning_rate": 1e-3,
|
| 106 |
+
"batch_size": batch_size,
|
| 107 |
+
"save_path": "best_model.pt",
|
| 108 |
+
}
|
| 109 |
+
training_task.connect(training_config)
|
| 110 |
|
| 111 |
|
| 112 |
# -------- Build the ML model --------
|
| 113 |
+
model = modelOne(noOfClasses=training_config["num_classes"])
|
| 114 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 115 |
|
| 116 |
|
| 117 |
# ------- Train the model (on subset for now) -------
|
| 118 |
|
|
|
|
|
|
|
| 119 |
print("\n--- Starting Model Training on Subset ---")
|
| 120 |
+
training_metrics = train_model(
|
| 121 |
model=model,
|
| 122 |
train_loader=subset_loaders['train'],
|
| 123 |
val_loader=subset_loaders['val'],
|
| 124 |
device=device,
|
| 125 |
+
n_epochs=training_config["n_epochs"],
|
| 126 |
+
lr=training_config["learning_rate"],
|
| 127 |
+
save_path=training_config["save_path"],
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ----------- Log metrics to ClearML -----------
|
| 132 |
+
# Per-batch training losses and accuracies
|
| 133 |
+
for i, loss in enumerate(training_metrics["losses"]):
|
| 134 |
+
training_logger.report_scalar("train", "loss_per_batch", value=loss, iteration=i)
|
| 135 |
+
|
| 136 |
+
for i, acc in enumerate(training_metrics["accuracies"]):
|
| 137 |
+
training_logger.report_scalar("train", "accuracy_per_batch", value=acc, iteration=i)
|
| 138 |
+
|
| 139 |
+
# Per-epoch validation accuracy
|
| 140 |
+
for epoch, acc in enumerate(training_metrics["val_accuracies"]):
|
| 141 |
+
training_logger.report_scalar("validation", "accuracy_per_epoch", value=acc, iteration=epoch)
|
| 142 |
+
|
| 143 |
+
training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
|
| 144 |
+
|
| 145 |
+
# Upload best model as artifact
|
| 146 |
+
training_task.upload_artifact("best_model", training_config["save_path"])
|
| 147 |
+
|
| 148 |
+
print("\nTraining complete.")
|
| 149 |
+
training_task.close()
|