azure-scripts / train_h100_final.py

azure home scripts: data gen, training, misc

a70eb3d verified 19 days ago

6.93 kB

	#!/usr/bin/env python3
	"""
	Final Training on H100 - 96GB VRAM Beast!
	Merges ALL datasets and trains with maximum performance
	"""

	from roboflow import Roboflow
	from ultralytics import YOLO
	import torch
	import os
	import shutil
	import yaml
	import glob
	from pathlib import Path

	# Opening banner for the run.
	for banner_line in ("=" * 70, "FINAL TRAINING ON H100 - BALANCED DATASET", "=" * 70):
	    print(banner_line)

	# Report whether CUDA is visible; device name and memory are only queried
	# when a GPU is actually present.
	cuda_ok = torch.cuda.is_available()
	print(f"\nGPU Available: {cuda_ok}")
	if cuda_ok:
	    print(f"GPU: {torch.cuda.get_device_name(0)}")
	    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
	    print(f"VRAM: {vram_gb:.0f} GB")

	# Step 1: Download all datasets from Roboflow
	print("\n" + "=" * 70)
	print("STEP 1: Downloading Datasets from Roboflow")
	print("=" * 70)

	# NOTE(review): both API keys below are committed in plain text. They are kept
	# only as fallbacks so the script still runs unchanged; rotate them and supply
	# the real values through ROBOFLOW_API_KEY_PRIMARY / ROBOFLOW_API_KEY_SECONDARY.
	#
	# Download locations are expanded with os.path.expanduser (as MERGED_DIR is
	# further down) so "~" always means the home directory instead of depending
	# on how the Roboflow client treats the raw string.

	# Dataset 1: New 212 helmet images (first client / account)
	rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY_PRIMARY", "cMpZOr1EizWFVrJ0Au4o"))
	print("\nDataset 1: New helmet images (212)...")
	project1 = rf.workspace("team11s-workspace-man05").project("helmet-detection-ihomd")
	ds1 = project1.version(1).download("yolov8", location=os.path.expanduser("~/helmet_212"))

	# Dataset 2: Old no-helmet (499) - datasets 2-4 all go through the second client
	print("\nDataset 2: No-helmet images (499)...")
	rf2 = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY_SECONDARY", "qeQs9chVa3kU0XnpTZsd"))
	project2 = rf2.workspace("nyc-nleyq").project("indian-cctv-traffic-violations")
	ds2 = project2.version(1).download("yolov8", location=os.path.expanduser("~/no_helmet_499"))

	# Dataset 3: With-helmet (300), second client
	print("\nDataset 3: With-helmet images (300)...")
	project3 = rf2.workspace("vivekvarikuti").project("withhelmet")
	ds3 = project3.version(1).download("yolov8", location=os.path.expanduser("~/with_helmet_300"))

	# Dataset 4: Triple-riding (626), second client
	print("\nDataset 4: Triple-riding (626)...")
	project4 = rf2.workspace("triple-ride-rsysj").project("triple-riding-detection-pniom")
	ds4 = project4.version(1).download("yolov8", location=os.path.expanduser("~/triple_riding_626"))

	print("\n✅ All datasets downloaded!")

	# Step 2: Merge all datasets
	print("\n" + "=" * 70)
	print("STEP 2: Merging ALL Datasets")
	print("=" * 70)

	MERGED_DIR = os.path.expanduser("~/final_merged_h100")

	# Destination layout: <MERGED_DIR>/<split>/{images,labels}
	for split in ['train', 'valid', 'test']:
	    os.makedirs(f"{MERGED_DIR}/{split}/images", exist_ok=True)
	    os.makedirs(f"{MERGED_DIR}/{split}/labels", exist_ok=True)


	def _names_by_id(cfg):
	    """Return ``{class_id: class_name}`` for one dataset config.

	    ``names`` may be a list (the position is the class id) or a mapping of
	    id -> name; both are normalised to a dict. A config without ``names``
	    yields an empty dict.
	    """
	    names = cfg.get('names') or {}
	    if isinstance(names, dict):
	        return {int(cls_id): name for cls_id, name in names.items()}
	    return dict(enumerate(names))


	# Collect all classes
	all_classes = set()
	datasets = [
	    (ds1.location, 'helmet212'),
	    (ds2.location, 'nohelmet499'),
	    (ds3.location, 'withhelmet300'),
	    (ds4.location, 'triple626'),
	]

	class_configs = {}
	for ds_path, ds_name in datasets:
	    yaml_path = f"{ds_path}/data.yaml"
	    if not os.path.exists(yaml_path):
	        # A dataset without a config ends up with an empty class map, which
	        # means its label ids are written through unchanged. Say so instead
	        # of skipping silently.
	        print(f"WARNING: {yaml_path} not found - class ids for {ds_name} will not be remapped")
	        continue
	    with open(yaml_path, 'r') as f:
	        cfg = yaml.safe_load(f) or {}  # an empty file parses to None
	    class_configs[ds_name] = cfg
	    all_classes.update(_names_by_id(cfg).values())

	unified_classes = sorted(all_classes)
	print(f"\nUnified classes ({len(unified_classes)}): {unified_classes}")

	# Create class mappings: per dataset, original class id -> unified class id.
	# The name -> unified id lookup is built once rather than via list.index().
	unified_index = {name: idx for idx, name in enumerate(unified_classes)}
	class_maps = {
	    ds_name: {
	        cls_id: unified_index[name]
	        for cls_id, name in _names_by_id(cfg).items()
	    }
	    for ds_name, cfg in class_configs.items()
	}

	# Copy and merge datasets
	def copy_with_remap(src_dir, prefix, class_mapping, dst_root=None):
	    """Copy one YOLO-format dataset into the merged tree, rewriting class ids.

	    For each of the train/valid/test splits present under ``src_dir``, every
	    image is copied to ``<dst_root>/<split>/images/<prefix>_<name>``. When the
	    image has a label file, each row with at least five fields is rewritten
	    with its class id looked up in ``class_mapping`` and saved under
	    ``<dst_root>/<split>/labels`` with the same prefix.

	    Args:
	        src_dir: Dataset root containing ``<split>/images`` and
	            ``<split>/labels``.
	        prefix: String prepended to copied file names so files coming from
	            different datasets cannot collide.
	        class_mapping: ``{old_id: new_id}``. Ids missing from the mapping are
	            written unchanged.
	        dst_root: Destination root. Defaults to the module-level
	            ``MERGED_DIR``.

	    Returns:
	        Number of images copied (with or without a label file).
	    """
	    if dst_root is None:
	        dst_root = MERGED_DIR

	    image_suffixes = ('.jpg', '.jpeg', '.png')
	    total = 0
	    for split in ['train', 'valid', 'test']:
	        src_img = f"{src_dir}/{split}/images"
	        src_lbl = f"{src_dir}/{split}/labels"

	        if not os.path.isdir(src_img):
	            continue

	        dst_img_dir = f"{dst_root}/{split}/images"
	        dst_lbl_dir = f"{dst_root}/{split}/labels"
	        os.makedirs(dst_img_dir, exist_ok=True)
	        os.makedirs(dst_lbl_dir, exist_ok=True)

	        # The earlier patterns "<dir>/.jpg" and "<dir>/.png" had no "*"
	        # wildcard, so they could only match files literally named ".jpg" or
	        # ".png" and real images were never picked up. Select by extension
	        # instead; dotfiles are skipped, as a "*.ext" glob would.
	        img_names = sorted(
	            name for name in os.listdir(src_img)
	            if not name.startswith('.') and name.lower().endswith(image_suffixes)
	        )

	        for img_name in img_names:
	            lbl_name = Path(img_name).stem + '.txt'
	            lbl_path = f"{src_lbl}/{lbl_name}"

	            # Copy image with prefix
	            shutil.copy2(f"{src_img}/{img_name}", f"{dst_img_dir}/{prefix}_{img_name}")
	            total += 1

	            # An image without a label file is still copied, just unlabelled.
	            if not os.path.exists(lbl_path):
	                continue

	            # Remap and copy label; rows with fewer than five fields are dropped.
	            remapped = []
	            with open(lbl_path, 'r') as f:
	                for line in f:
	                    parts = line.split()
	                    if len(parts) >= 5:
	                        old_cls = int(parts[0])
	                        new_cls = class_mapping.get(old_cls, old_cls)
	                        remapped.append(f"{new_cls} {' '.join(parts[1:])}\n")

	            # No label file is written when nothing valid remains.
	            if remapped:
	                with open(f"{dst_lbl_dir}/{prefix}_{lbl_name}", 'w') as f:
	                    f.writelines(remapped)

	    return total

	# Copy every dataset into the merged tree under its own short file-name prefix.
	print("\nCopying datasets...")
	file_prefixes = ['h212', 'nh499', 'wh300', 'tr626']
	for dataset_entry, prefix in zip(datasets, file_prefixes):
	    ds_path, ds_name = dataset_entry
	    ds_class_map = class_maps.get(ds_name, {})
	    count = copy_with_remap(ds_path, prefix, ds_class_map)
	    print(f" {ds_name}: {count} images")

	# Report how many files ended up in each split of the merged tree.
	print("\nFinal merged dataset:")
	for split in ['train', 'valid', 'test']:
	    split_count = len(glob.glob(f"{MERGED_DIR}/{split}/images/*"))
	    print(f" {split}: {split_count} images")

	# Dataset config consumed by the training step: split locations are given
	# relative to 'path', followed by the unified class list and its size.
	merged_yaml = dict(
	    path=MERGED_DIR,
	    train='train/images',
	    val='valid/images',
	    test='test/images',
	    nc=len(unified_classes),
	    names=unified_classes,
	)

	yaml_path = f"{MERGED_DIR}/data.yaml"
	yaml_text = yaml.dump(merged_yaml, default_flow_style=False)
	with open(yaml_path, 'w') as f:
	    f.write(yaml_text)

	print(f"\nConfig saved: {yaml_path}")

	# Step 3: Train on H100 with OPTIMIZED settings
	for banner_line in ("\n" + "=" * 70, "STEP 3: TRAINING ON H100 (96GB VRAM!)", "=" * 70):
	    print(banner_line)

	model = YOLO('yolo26m.pt')

	# Human-readable summary of the run; the literal values mirror the keyword
	# groups passed to model.train() below.
	config_summary = [
	    "\nTraining config:",
	    " Model: YOLO26m",
	    " Epochs: 150 (faster with H100)",
	    " Batch: -1 (auto - H100 can handle 64-128!)",
	    " Image size: 640",
	    f" Classes: {len(unified_classes)}",
	    "\nStarting training...",
	]
	for summary_line in config_summary:
	    print(summary_line)

	# Run-level settings: dataset config, schedule, hardware and output location.
	run_settings = dict(
	    data=yaml_path,
	    epochs=150,
	    imgsz=640,
	    batch=-1,      # -1 requests automatic batch sizing
	    cache='ram',   # keep the dataset cached in host memory
	    device=0,
	    workers=8,
	    patience=30,
	    name='h100_final',
	    project='outputs',
	)

	# Augmentation
	augmentation = dict(
	    hsv_h=0.015,
	    hsv_s=0.7,
	    hsv_v=0.4,
	    degrees=10,
	    translate=0.1,
	    scale=0.5,
	    fliplr=0.5,
	    mosaic=1.0,
	    mixup=0.1,
	)

	# Learning-rate, mixed-precision and reporting options.
	optimisation = dict(
	    lr0=0.01,
	    lrf=0.01,
	    amp=True,
	    val=True,
	    plots=True,
	)

	results = model.train(**run_settings, **augmentation, **optimisation)

	print("\n" + "=" * 70)
	print("TRAINING COMPLETE!")
	print("=" * 70)

	# Validate the trained model and print each headline metric both as a
	# fraction and as a percentage.
	metrics = model.val()
	print("\nFinal Metrics:")
	for metric_label, metric_value in (
	    ("mAP50", metrics.box.map50),
	    ("mAP50-95", metrics.box.map),
	    ("Precision", metrics.box.mp),
	    ("Recall", metrics.box.mr),
	):
	    print(f" {metric_label}: {metric_value:.4f} ({metric_value*100:.1f}%)")

	# Export
	print("\nExporting to ONNX...")
	model.export(format='onnx', dynamic=True, simplify=True)

	# The run directory is not guaranteed to be outputs/h100_final: Ultralytics
	# picks an incremented name when that directory already exists. Report the
	# path the trainer actually used, falling back to the nominal location when
	# the trainer does not expose one.
	trainer = getattr(model, "trainer", None)
	best_weights = getattr(trainer, "best", None) or "outputs/h100_final/weights/best.pt"

	print("\n" + "=" * 70)
	print(f"Model saved: {best_weights}")
	print("=" * 70)