#!/usr/bin/env python3
"""v3 max-throughput: yolo26m on 6673-img dataset, batch=128, cache=ram.

Running this file as a script:

1. prints the name and total memory of CUDA device 0,
2. trains ``yolo26m.pt`` on the dataset described by ``DATA_YAML``,
3. reloads ``BEST_WEIGHTS`` and validates it on the ``val`` split, the
   ``test`` split, and the ``test`` split again with ``augment=True``.

All work is behind an ``if __name__ == '__main__'`` guard so that importing
the module (as spawn-based worker processes do) does not start a second run.
"""
from ultralytics import YOLO
import torch

# Dataset definition shared by training and every validation pass.
DATA_YAML = '/home/azureuser/merged_v3/data.yaml'

# Output location passed to train(); the evaluation step derives the weights
# path from the same two values so they cannot drift apart.
PROJECT = 'runs_v3'
RUN_NAME = 'h100_3class_v3'
# NOTE(review): assumes the trainer writes <project>/<name>/weights/best.pt
# relative to the current working directory — confirm for the installed
# ultralytics version.
BEST_WEIGHTS = f'{PROJECT}/{RUN_NAME}/weights/best.pt'


def report_gpu():
    """Print the name and total memory (rounded, in GB) of CUDA device 0."""
    print('GPU:', torch.cuda.get_device_name(0), '|',
          round(torch.cuda.get_device_properties(0).total_memory/1e9), 'GB')


def train():
    """Train yolo26m.pt on DATA_YAML with the v3 hyperparameters."""
    model = YOLO('yolo26m.pt')
    model.train(
        data=DATA_YAML,
        epochs=200,
        imgsz=640,
        batch=128,          # 2x v2 — should hit ~70GB VRAM
        device=0,
        workers=16,         # feed data faster
        cache='ram',        # dataset is ~1GB, fits easily
        project=PROJECT,
        name=RUN_NAME,
        exist_ok=True,      # reuse the run directory instead of creating a new one
        amp=True,
        cos_lr=True,
        close_mosaic=20,
        # Augmentation settings.
        mosaic=1.0,
        mixup=0.15,
        copy_paste=0.3,
        hsv_h=0.015,
        hsv_s=0.7,
        hsv_v=0.4,
        degrees=5.0,
        translate=0.1,
        scale=0.5,
        fliplr=0.5,
        # Loss gains.
        cls=1.0,
        box=7.5,
        dfl=1.5,
        weight_decay=0.0005,
        optimizer='auto',
        patience=60,
        plots=True,
        verbose=True,
    )


def evaluate():
    """Validate BEST_WEIGHTS on val, test, and test with augment=True."""
    m = YOLO(BEST_WEIGHTS)
    print('--- VAL ---')
    m.val(data=DATA_YAML, split='val')
    print('--- TEST ---')
    m.val(data=DATA_YAML, split='test')
    print('--- TEST + TTA ---')
    m.val(data=DATA_YAML, split='test', augment=True)


def main():
    """Run the full pipeline: GPU report, training, then evaluation."""
    report_gpu()
    train()
    print('TRAIN DONE — running val + test')
    evaluate()


if __name__ == '__main__':
    main()