#!/usr/bin/env python3
# 3-class clean training on H100 NVL
# Classes: 0 no-helmet | 1 with-helmet | 2 triple-riding
from ultralytics import YOLO
import torch, os

# Report which CUDA device (index 0) the run will use and its total memory,
# expressed as bytes / 1e9 and labelled 'GB'.
gpu_name = torch.cuda.get_device_name(0)
gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print('GPU:', gpu_name, '|', gpu_mem_gb, 'GB')

# Fine-tune from the pretrained yolo26m checkpoint (auto-downloads if missing).
model = YOLO('yolo26m.pt')

# Run setup: dataset, duration, hardware and output location.
run_cfg = {
    'data': '/home/azureuser/clean_merged_data/data.yaml',
    'epochs': 150,
    'imgsz': 640,
    'batch': 64,            # H100 NVL has 95GB, so a large batch fits
    'device': 0,
    'workers': 8,
    'project': 'runs_clean',
    'name': 'h100_3class',
    'exist_ok': True,       # reuse the same run directory on re-runs
    'amp': True,
    'cos_lr': True,
    'close_mosaic': 15,
}

# Augmentation: important for a ~10k image dataset.
augment_cfg = {
    'mosaic': 1.0,
    'mixup': 0.15,
    # Intended to boost the with-helmet class.
    # NOTE(review): Ultralytics documents copy_paste as a segmentation
    # augmentation; confirm it has any effect on box-only labels.
    'copy_paste': 0.3,
    'hsv_h': 0.015,
    'hsv_s': 0.7,
    'hsv_v': 0.4,
    'degrees': 5.0,
    'translate': 0.1,
    'scale': 0.5,
    'fliplr': 0.5,
}

# Loss term weights.
loss_cfg = {
    'cls': 1.0,             # classification loss weight (raise if classes are still confused)
    'box': 7.5,
    'dfl': 1.5,
}

# Regularization and optimisation schedule.
optim_cfg = {
    'weight_decay': 0.0005,
    'dropout': 0.0,
    'optimizer': 'auto',
    # NOTE(review): Ultralytics logs that optimizer='auto' picks its own
    # learning rate and ignores lr0; confirm this value is actually applied.
    'lr0': 0.01,
    'patience': 40,
    'plots': True,
    'verbose': True,
}

results = model.train(**run_cfg, **augment_cfg, **loss_cfg, **optim_cfg)
print('TRAIN DONE — running val on test split')
# Evaluate the checkpoint the trainer actually wrote rather than a hand-typed
# copy of project/name: if either is renamed (or exist_ok is turned off and the
# run directory gets a numeric suffix) a hard-coded path would silently point
# at stale weights from an earlier run, or at nothing at all.
# Falls back to the conventional location if the trainer handle is unavailable.
trainer = getattr(model, 'trainer', None)
best_weights = getattr(trainer, 'best', None) or 'runs_clean/h100_3class/weights/best.pt'
m = YOLO(best_weights)
# Score on the held-out 'test' split declared in data.yaml; also write plots
# and a JSON dump of the predictions.
m.val(data='/home/azureuser/clean_merged_data/data.yaml', split='test', plots=True, save_json=True)