# NOTE(review): removed non-code page-extraction residue (web file-listing
# metadata: blob hashes and a line-number gutter) that is not valid Python.
from clearml import Task
from dataPrep.helpers.clearml_data import extract_latest_data_task
import torch
from models.modelOne import modelOne
from models.modelTwo import BetterCNN
from trainingModel.helpers.Training import train_model
# -------------- Load Data --------------
# Number of DataLoader worker processes; 0 keeps data loading in the main
# process (safest default for notebooks / Windows).
NUM_WORKERS = 0
project_name = "Small Group Project"
# Fetch the outputs of the most recent data-preparation task for this project.
# Presumably returns (subset loaders dict, full-dataset loaders dict, prep
# metadata) — TODO confirm against extract_latest_data_task's contract.
subset_loaders, full_loaders, data_prep_metadata = extract_latest_data_task(project_name=project_name, num_workers=NUM_WORKERS)
# -------- ClearML Training Task Setup --------
training_task = Task.init(
    project_name=f"{project_name}/Model Training",
    task_name="Model Training",
    # Always create a fresh task instead of resuming the previous run.
    reuse_last_task_id=False,
)
# Detail the data prep task used
training_logger = training_task.get_logger()
# Attach the upstream data-prep metadata to this task so the training run
# records exactly which data-prep outputs it consumed ("READONLY" in the name
# signals it should not be edited when the task is cloned).
training_task.connect(data_prep_metadata, name="data_prep_metadata_READONLY")
# Training parameters - Modify these to experiment
training_config = {
    "num_classes": 39,  # presumably the dataset's class count — TODO confirm
    "n_epochs": 1,
    "learning_rate": 1e-3,
    # NOTE(review): "optimizer" is never forwarded to train_model below —
    # confirm whether this key is consumed elsewhere or is stale.
    "optimizer": "adam",
    "save_path": "best_model.pt",  # checkpoint path for the best model
    "num_workers": NUM_WORKERS
}
# Register the hyperparameters with ClearML so they show in the task UI and
# can be overridden when the task is cloned/enqueued.
training_task.connect(training_config)
# -------- Build the ML model --------
# Instantiate the CNN sized to the configured number of output classes.
model = BetterCNN(noOfClasses=training_config["num_classes"])
# Prefer the GPU when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
# Print device info
print(f"\n**Using device: {device}**\n")
if device.type == 'cuda':
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
# ------- Train the model (on subset for now) -------
print("\n--- Starting Model Training on Subset ---")
# Train on the subset loaders only (faster iteration while experimenting);
# switch to full_loaders for a complete run.
# NOTE(review): training_config["optimizer"] is not passed here — presumably
# train_model picks its own optimizer; confirm the config key is not stale.
training_metrics = train_model(
    model=model,
    train_loader=subset_loaders['train'],
    val_loader=subset_loaders['val'],
    n_epochs=training_config["n_epochs"],
    lr=training_config["learning_rate"],
    num_classes=training_config["num_classes"],
    save_path=training_config["save_path"],
    early_stop=3,  # assumed: patience of 3 epochs without improvement — confirm in train_model
)
# ----------- Log metrics to ClearML -----------
# Report each per-epoch series under its own scalar plot. Order matches the
# original reporting sequence: train loss, train accuracy, val accuracy.
epoch_series = (
    ("training epoch loss", "loss", "losses"),
    ("training epoch accuracy", "accuracy", "accuracies"),
    ("validation epoch accuracy", "accuracy", "val_accuracies"),
)
for title, series_name, metrics_key in epoch_series:
    for epoch, value in enumerate(training_metrics[metrics_key]):
        training_logger.report_scalar(title, series_name, value=value, iteration=epoch)
# Single summary value: best validation accuracy achieved across all epochs.
training_logger.report_single_value("best_val_accuracy", training_metrics["best_accuracy"])
# ----------- Finalize the run -----------
# Upload the checkpoint written by train_model as a task artifact so the best
# model is retrievable from the ClearML UI / API.
training_task.upload_artifact("best_model", training_config["save_path"])
print("\nTraining complete.")
# Mark the task completed and flush any pending reports/uploads.
# Fix: removed the trailing " |" gutter artifact after close() that made this
# line a syntax error.
training_task.close()