File size: 2,742 Bytes
0abee12
 
8e6181a
7ef5142
84cfdfc
89f7e00
3f67469
8e6181a
 
c638d1e
78fbc90
0abee12
78fbc90
c638d1e
 
 
 
3f67469
c638d1e
 
 
 
25fbc07
c638d1e
0abee12
c638d1e
 
 
 
3f67469
c638d1e
25fbc07
c638d1e
78fbc90
c638d1e
 
8e6181a
 
 
89f7e00
8e6181a
3f67469
8e6181a
3f67469
 
 
 
8e6181a
 
18d7ed3
 
c638d1e
8e6181a
 
 
c638d1e
 
25fbc07
c638d1e
e6d94e8
c638d1e
 
 
 
25fbc07
e6d94e8
 
 
 
 
c638d1e
25fbc07
c638d1e
4452b74
c638d1e
e6d94e8
c638d1e
 
 
 
 
 
7ef5142
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from clearml import Task
from dataPrep.helpers.clearml_data import extract_latest_data_task

import torch
from models.modelOne import modelOne
from models.modelTwo import BetterCNN
from trainingModel.helpers.Training import train_model


# -------------- Load Data --------------
# DataLoader worker processes; 0 keeps all data loading in the main process.
NUM_WORKERS = 0
project_name = "Small Group Project"

# Fetch the most recent data-prep task's loaders plus the metadata it recorded.
subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(
    project_name=project_name,
    num_workers=NUM_WORKERS,
)


# -------- ClearML Training Task Setup --------
# Start a brand-new task each run (no reuse) so every experiment is tracked separately.
training_task = Task.init(
    project_name=f"{project_name}/Model Training",
    task_name="Model Training",
    reuse_last_task_id=False,
)
training_logger = training_task.get_logger()

# Record which data-prep run produced these loaders (provenance, read-only).
training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")

# Hyperparameters — modify these to experiment; connect() surfaces them in the
# ClearML UI and lets remote runs override them.
training_config = {
    "num_classes": 39,
    "n_epochs": 1,
    "learning_rate": 1e-3,
    "optimizer": "adam",
    "save_path": "best_model.pt",
    "num_workers": NUM_WORKERS,
}
training_task.connect(training_config)


# -------- Build the ML model --------
# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BetterCNN(noOfClasses=training_config["num_classes"])
model.to(device)

# Report where the model will run.
print(f"\n**Using device: {device}**\n")
if device.type == 'cuda':
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# ------- Train the model (on subset for now) -------
print("\n--- Starting Model Training on Subset ---")
# Train on the subset loaders and collect per-epoch metrics.
# early_stop (patience in epochs) now reads from training_config like every
# other hyperparameter, falling back to the previous hard-coded 3 when the
# key is absent — backward compatible; add "early_stop" to the config to tune it.
training_metrics = train_model(
    model=model,
    train_loader=subset_loaders['train'],
    val_loader=subset_loaders['val'],
    n_epochs=training_config["n_epochs"],
    lr=training_config["learning_rate"],
    num_classes=training_config["num_classes"],
    save_path=training_config["save_path"],
    early_stop=training_config.get("early_stop", 3),
)


# ----------- Log metrics to ClearML -----------
# Per-epoch curves: training loss, training accuracy, then validation accuracy.
# Iteration order matches the original separate loops (one full series at a time).
per_epoch_series = (
    ("training epoch loss", "loss", "losses"),
    ("training epoch accuracy", "accuracy", "accuracies"),
    ("validation epoch accuracy", "accuracy", "val_accuracies"),
)
for title, series, metric_key in per_epoch_series:
    for epoch, value in enumerate(training_metrics[metric_key]):
        training_logger.report_scalar(title, series, value=value, iteration=epoch)

# Headline number shown in the run summary.
training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])

# Attach the checkpoint train_model wrote so it is downloadable from the task.
training_task.upload_artifact("best_model", training_config["save_path"])

print("\nTraining complete.")
training_task.close()