azure-scripts / train_h100_clean.py
vivekvar's picture
azure home scripts: data gen, training, misc
a70eb3d verified
#!/usr/bin/env python3
# 3-class clean training on H100 NVL
# Classes: 0 no-helmet | 1 with-helmet | 2 triple-riding
from ultralytics import YOLO
import torch, os
# Report which device sits at CUDA index 0 and how much memory it exposes
# (total_memory divided by 1e9, printed with a 'GB' suffix).
gpu_name = torch.cuda.get_device_name(0)
gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print('GPU:', gpu_name, '|', gpu_mem_gb, 'GB')
# Fine-tune from the pretrained yolo26m checkpoint (auto-downloads if missing).
model = YOLO('yolo26m.pt')

# Dataset, run length, hardware and output location.
run_setup = dict(
    data='/home/azureuser/clean_merged_data/data.yaml',
    epochs=150,
    imgsz=640,
    batch=64,  # H100 NVL has 95GB, so the batch can be pushed high
    device=0,
    workers=8,
    project='runs_clean',
    name='h100_3class',
    exist_ok=True,
    amp=True,
    plots=True,
    verbose=True,
)

# Augmentation — considered important for a dataset of roughly 10k images.
augmentation = dict(
    mosaic=1.0,
    close_mosaic=15,
    mixup=0.15,
    # Intended to boost the with-helmet class by pasting objects.
    # NOTE(review): confirm copy_paste actually takes effect on this dataset;
    # it may require segmentation labels rather than plain boxes.
    copy_paste=0.3,
    hsv_h=0.015,
    hsv_s=0.7,
    hsv_v=0.4,
    degrees=5.0,
    translate=0.1,
    scale=0.5,
    fliplr=0.5,
)

# Loss term weights.
loss_weights = dict(
    cls=1.0,  # classification loss weight (bump if classes are still confused)
    box=7.5,
    dfl=1.5,
)

# Regularization, optimizer and schedule.
optimisation = dict(
    weight_decay=0.0005,
    # NOTE(review): dropout is presumably only meaningful for classification
    # models — verify it does anything for this detector.
    dropout=0.0,
    optimizer='auto',
    # NOTE(review): with optimizer='auto' the library may pick the learning
    # rate itself and ignore lr0 — confirm against the training log.
    lr0=0.01,
    cos_lr=True,
    patience=40,
)

# The four groups share no keys, so this expands to one flat keyword list.
results = model.train(
    **run_setup,
    **augmentation,
    **loss_weights,
    **optimisation,
)
print('TRAIN DONE — running val on test split')
# Ask the trainer that just ran where it wrote its checkpoint instead of
# re-assembling the path from the project/name literals: the evaluation then
# keeps following this run's weights even if `project`, `name` or `exist_ok`
# in the train call are changed later.
best_ckpt = model.trainer.best
if not best_ckpt.exists():
    # No best checkpoint was written; evaluate the most recent one instead.
    best_ckpt = model.trainer.last
print('Evaluating checkpoint:', best_ckpt)
m = YOLO(best_ckpt)
# NOTE(review): split='test' presumably needs a `test:` entry in data.yaml —
# confirm before launching a long training run.
m.val(
    data='/home/azureuser/clean_merged_data/data.yaml',
    split='test',
    plots=True,
    save_json=True,
)