# Source: arm-model / model / train_high_accuracy.py
# Uploaded by pragadeeshv23 via huggingface_hub (commit 5b86813, verified)
# NOTE(review): these provenance lines precede the shebang on L5, so the
# shebang is inert when the file is executed directly; run via `python3 ...`.
#!/usr/bin/env python3
"""
High-Accuracy Training Script for Road Anomaly Detection
=========================================================
Optimised for: RTX 2050 (4 GB), i5-12450H, 15 GB RAM
Model: YOLO11s β€” 9.4M params, 21.5 GFLOPs (3.6Γ— more than 11n)
Usage:
python train_high_accuracy.py # Full training (300 epochs)
python train_high_accuracy.py --dry-run # Quick 2-epoch test run
"""
import os
import sys
import shutil
import logging
import argparse
from pathlib import Path
from datetime import datetime
# Log to both the console and a persistent file so long training runs
# can still be reviewed after the terminal scrolls away.
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler("training_optimised.log"),
]
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)
logger = logging.getLogger("train_optimised")
def main():
    """Train YOLO11s for road-anomaly detection, then validate and export.

    Pipeline:
      1. Parse CLI flags (``--dry-run`` runs 2 epochs instead of 300).
      2. Check GPU availability and pick a batch size that fits VRAM.
      3. Train with aggressive augmentation tuned for a small dataset.
      4. Copy ``best.pt``/``last.pt`` to standard locations and run a
         final validation pass, printing the headline metrics.

    Exits with status 1 on missing dependencies, a missing dataset,
    training failure, or if no ``best.pt`` can be located afterwards.
    """
    parser = argparse.ArgumentParser(description="Train YOLO11s for road anomaly detection")
    parser.add_argument("--dry-run", action="store_true",
                        help="Quick 2-epoch test to verify everything works")
    args = parser.parse_args()

    # Heavy imports are deferred so a missing install produces a clear,
    # actionable message instead of a raw ImportError traceback.
    try:
        import torch
        from ultralytics import YOLO
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Run: pip install ultralytics torch")
        sys.exit(1)

    is_dry_run = args.dry_run
    epochs = 2 if is_dry_run else 300
    run_name = "dry_run" if is_dry_run else "high_accuracy_s"

    print()
    print("=" * 60)
    if is_dry_run:
        print(" DRY RUN — 2 epochs to verify setup")
    else:
        print(" HIGH-ACCURACY ROAD ANOMALY DETECTION TRAINING")
        print(" YOLO11s • RTX 2050 (4 GB) optimised")
    print("=" * 60)
    print(f" Started: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print()

    # ── GPU check ──
    # Bug fix: the original hard-coded device=0 for both train() and val(),
    # which raises on CPU-only machines despite the warning below.  Select
    # the device once here and use it everywhere.
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        vram_gb = gpu.total_memory / (1024 ** 3)
        device = 0
        print(f" GPU: {gpu.name} ({vram_gb:.1f} GB)")
    else:
        vram_gb = 0
        device = "cpu"
        print(" WARNING: No GPU — training will be very slow")

    # ── Pick batch size based on VRAM ──
    # YOLO11s at 640px: ~2.2 GB at batch=2, ~3.2 GB at batch=4
    if vram_gb >= 6:
        batch = 8
    elif vram_gb >= 4:
        batch = 4
    else:
        batch = 2  # Safe for RTX 2050 (3.7 GB) with YOLO11s

    # ── Dataset path (absolute, so the current working dir doesn't matter) ──
    script_dir = Path(__file__).resolve().parent
    data_yaml = script_dir / "dataset" / "data.yaml"
    if not data_yaml.exists():
        print(f" ERROR: Dataset not found: {data_yaml}")
        sys.exit(1)

    def _count_images(split):
        # Informational count only; also covers .jpeg/.png exports
        # (the original only globbed *.jpg).
        img_dir = script_dir / "dataset" / split / "images"
        return sum(len(list(img_dir.glob(pat))) for pat in ("*.jpg", "*.jpeg", "*.png"))

    print(f" Dataset: {_count_images('train')} train / {_count_images('valid')} val images")
    print(f" Batch size: {batch}")
    print(" Image size: 640 (native 600x600 — no downscaling)")
    print(f" Epochs: {epochs}")
    print()

    # ── Load model ──
    # YOLO11s: 9.4M params, 21.5 GFLOPs — 3.6× more capacity than 11n.
    # Fits in 3.7 GB VRAM at batch=2 with AMP (~2.2 GB).
    model_name = "yolo11s.pt"
    print(f" Base model: {model_name}")
    model = YOLO(model_name)

    # ══════════════════════════════════════════════════════════════════
    # TRAINING — optimised hyperparameters
    # ══════════════════════════════════════════════════════════════════
    # train() returns metrics, not paths, so its return value is unused;
    # the save directory is read from model.trainer afterwards.
    try:
        model.train(
            # ── Data ──
            data=str(data_yaml),
            imgsz=640,                         # Match native 600×600 (padded to 640)
            # ── Training schedule ──
            epochs=epochs,
            patience=0 if is_dry_run else 50,  # No early stop in dry run
            batch=batch,                       # Fit in 4 GB VRAM
            # ── Optimiser ──
            optimizer="AdamW",
            lr0=0.002,                         # Slightly higher LR for small dataset
            lrf=0.01,                          # Final LR = lr0 × lrf (cosine decay)
            momentum=0.937,
            weight_decay=0.0005,
            warmup_epochs=10,                  # Longer warmup for stability
            warmup_momentum=0.5,
            warmup_bias_lr=0.01,
            # ── Augmentation (aggressive for small dataset) ──
            hsv_h=0.02,                        # Hue shift
            hsv_s=0.75,                        # Saturation shift
            hsv_v=0.5,                         # Value/brightness shift
            degrees=15.0,                      # Rotation ±15°
            translate=0.2,                     # Translation ±20%
            scale=0.5,                         # Scale ±50%
            shear=5.0,                         # Shear ±5°
            perspective=0.0001,                # Slight perspective warp
            flipud=0.1,                        # Vertical flip (road can be upside-down in data)
            fliplr=0.5,                        # Horizontal flip
            mosaic=1.0,                        # Full mosaic — critical for small datasets
            mixup=0.15,                        # Mix images to reduce overfitting
            copy_paste=0.1,                    # Copy-paste augmentation
            erasing=0.2,                       # Random erasing (dropout-like)
            close_mosaic=20,                   # Disable mosaic last 20 epochs for fine-tuning
            # ── Performance ──
            device=device,                     # 0 (GPU) or "cpu" — see GPU check above
            workers=4,                         # 12 threads but limited RAM
            cache="disk",                      # Don't eat RAM (only 15 GB)
            amp=True,                          # Mixed precision — saves VRAM
            # ── Saving ──
            project="road_anomaly",
            name=run_name,
            exist_ok=True,
            save=True,
            save_period=25,                    # Checkpoint every 25 epochs
            val=True,
            plots=True,
            # ── Advanced ──
            cos_lr=True,                       # Cosine learning rate schedule
            nbs=64,                            # Nominal batch size for LR scaling
        )
    except Exception as e:
        logger.error("Training failed: %s", e)
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # ══════════════════════════════════════════════════════════════════
    # Post-training — copy weights & validate
    # ══════════════════════════════════════════════════════════════════
    # Get save_dir from the trainer (NOT from train()'s return value,
    # which holds metrics, not paths).
    save_dir = Path(model.trainer.save_dir)
    logger.info("Training save dir: %s", save_dir)
    best_src = save_dir / "weights" / "best.pt"
    last_src = save_dir / "weights" / "last.pt"

    # Fallback: search likely roots if the expected path doesn't exist,
    # picking the most recently modified best.pt found.
    if not best_src.exists():
        logger.warning("best.pt not at expected path: %s", best_src)
        logger.info("Searching for best.pt...")
        for search_root in (Path("road_anomaly"), Path("runs"), script_dir):
            if not search_root.exists():
                continue
            candidates = sorted(search_root.rglob("best.pt"),
                                key=lambda p: p.stat().st_mtime, reverse=True)
            if candidates:
                best_src = candidates[0]
                last_src = best_src.parent / "last.pt"
                logger.info("Found best.pt at: %s", best_src)
                break

    if not best_src.exists():
        logger.error("FATAL: best.pt not found anywhere after training!")
        logger.error("Check these directories manually:")
        logger.error(" %s", save_dir)
        for p in Path(".").rglob("best.pt"):
            logger.error(" Found: %s", p)
        sys.exit(1)

    # Copy to standard locations so downstream tooling can find the weights.
    dest_dir = script_dir / "runs"
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest_best = dest_dir / "best.pt"
    shutil.copy2(best_src, dest_best)
    logger.info("Best model copied to: %s", dest_best)

    # Also copy to project root for convenience
    shutil.copy2(best_src, script_dir / "best.pt")
    logger.info("Best model copied to: %s", script_dir / "best.pt")
    if last_src.exists():
        shutil.copy2(last_src, dest_dir / "last.pt")
        logger.info("Last model copied to: %s", dest_dir / "last.pt")

    # ── Final validation ──
    print()
    print("=" * 60)
    print(" FINAL VALIDATION")
    print("=" * 60)
    try:
        best_model = YOLO(str(dest_best))
        metrics = best_model.val(data=str(data_yaml), imgsz=640, device=device)
        p = metrics.box.mp   # mean precision across classes
        r = metrics.box.mr   # mean recall across classes
        f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
        print(f" mAP@0.5: {metrics.box.map50*100:.1f}%")
        print(f" mAP@0.5:0.95: {metrics.box.map*100:.1f}%")
        print(f" Precision: {p*100:.1f}%")
        print(f" Recall: {r*100:.1f}%")
        print(f" F1-score: {f1*100:.1f}%")
        print(f" Inference: {metrics.speed['inference']:.1f} ms/image")
        print()
        # Per-class breakdown
        print(" Per-class mAP@0.5:")
        for i, ap in enumerate(metrics.box.ap50):
            print(f" {best_model.names[i]:>20s}: {ap*100:.1f}%")
        print("=" * 60)
    except Exception as e:
        # Best-effort: the weights are already saved, so don't exit nonzero.
        logger.error("Validation failed: %s", e)
        print(" Validation failed but model was saved successfully.")
        print(f" Model at: {dest_best}")

    print()
    print(f" Finished: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f" Model saved to: {dest_best}")
    print(" Run 'python evaluate.py' to re-check anytime.")
    print()
# Standard script entry point: only run training when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()