# File size: 10,546 Bytes
#
# 5b86813

#!/usr/bin/env python3
"""
High-Accuracy Training Script for Road Anomaly Detection
=========================================================
Optimised for: RTX 2050 (4 GB), i5-12450H, 15 GB RAM

Model: YOLO11s — 9.4M params, 21.5 GFLOPs (3.6× more than 11n)

Usage:
  python train_high_accuracy.py              # Full training (300 epochs)
  python train_high_accuracy.py --dry-run    # Quick 2-epoch test run
"""

import os
import sys
import shutil
import logging
import argparse
from pathlib import Path
from datetime import datetime

# Root logging configuration: records at INFO and above are sent to the
# console and also written to training_optimised.log (path is relative to
# the current working directory).
logging.basicConfig(
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("training_optimised.log"),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Script-wide logger used for status and error messages.
logger = logging.getLogger("train_optimised")


# File suffixes counted as dataset images in the start-up summary.
_IMAGE_SUFFIXES = frozenset(
    {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
)


def _select_batch_size(vram_gb):
    """Return the training batch size for a device with *vram_gb* GiB of VRAM."""
    # YOLO11s at 640px: ~2.2 GB at batch=2, ~3.2 GB at batch=4
    if vram_gb >= 6:
        return 8
    if vram_gb >= 4:
        return 4
    # Safe for RTX 2050 (reports ~3.7 GB) with YOLO11s; also used when no GPU
    # is present (vram_gb == 0).
    return 2


def _count_images(folder):
    """Return the number of image files directly inside *folder* (0 if absent)."""
    if not folder.is_dir():
        return 0
    return sum(
        1
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in _IMAGE_SUFFIXES
    )


def _locate_weights(save_dir, script_dir):
    """Return ``(best, last)`` checkpoint paths for the finished run.

    The trainer's own ``weights`` directory is preferred.  When ``best.pt`` is
    not there, the most recently modified ``best.pt`` found under
    ``road_anomaly/``, ``runs/`` (both relative to the working directory) or
    *script_dir* is used instead, and ``last.pt`` is assumed to sit next to it.
    The returned paths are not guaranteed to exist; the caller must check.
    """
    best_src = save_dir / "weights" / "best.pt"
    last_src = save_dir / "weights" / "last.pt"
    if best_src.exists():
        return best_src, last_src

    logger.warning("best.pt not at expected path: %s", best_src)
    logger.info("Searching for best.pt...")
    # NOTE: this is best-effort; the newest file found may come from an
    # earlier run rather than the one that just finished.
    for search_root in (Path("road_anomaly"), Path("runs"), script_dir):
        if not search_root.exists():
            continue
        newest = max(search_root.rglob("best.pt"),
                     key=lambda p: p.stat().st_mtime, default=None)
        if newest is not None:
            best_src = newest
            last_src = newest.parent / "last.pt"
            logger.info("Found best.pt at: %s", best_src)
            break
    return best_src, last_src


def _export_checkpoint(src, dst, label):
    """Copy checkpoint *src* to *dst* and log it under *label* ("Best"/"Last").

    The fallback search may return a file that already *is* the destination;
    ``shutil.copy2`` raises ``SameFileError`` in that case, which is treated
    as "nothing to do" instead of aborting after a completed training run.
    """
    try:
        shutil.copy2(src, dst)
    except shutil.SameFileError:
        logger.info("%s model already in place: %s", label, dst)
    else:
        logger.info("%s model copied to: %s", label, dst)


def main():
    """Train YOLO11s on the road-anomaly dataset, export weights and validate.

    Command line:
        --dry-run   Train for 2 epochs instead of 300 to verify the setup.

    Steps:
        1. Import torch/ultralytics (lazily, so a missing package yields an
           install hint).
        2. Pick the device (GPU 0, or CPU when CUDA is unavailable) and a
           batch size based on the detected VRAM.
        3. Train with the hyperparameters below, reading
           ``dataset/data.yaml`` next to this script.
        4. Copy ``best.pt`` / ``last.pt`` to ``<script_dir>/runs`` and
           ``best.pt`` to ``<script_dir>``.
        5. Run a final validation and print the metrics (best-effort: a
           validation failure is logged but does not change the exit status).

    Exits with status 1 when a dependency is missing, the dataset YAML does
    not exist, training raises, or no ``best.pt`` can be found afterwards.
    """
    parser = argparse.ArgumentParser(description="Train YOLO11s for road anomaly detection")
    parser.add_argument("--dry-run", action="store_true",
                        help="Quick 2-epoch test to verify everything works")
    args = parser.parse_args()

    try:
        import torch
        from ultralytics import YOLO
    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("Run: pip install ultralytics torch")
        sys.exit(1)

    is_dry_run = args.dry_run
    epochs = 2 if is_dry_run else 300
    run_name = "dry_run" if is_dry_run else "high_accuracy_s"

    print()
    print("=" * 60)
    if is_dry_run:
        print("  DRY RUN — 2 epochs to verify setup")
    else:
        print("  HIGH-ACCURACY ROAD ANOMALY DETECTION TRAINING")
    print("  YOLO11s • RTX 2050 (4 GB) optimised")
    print("=" * 60)
    print(f"  Started: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print()

    # ── GPU check ──
    # The device is chosen once and reused for training and validation so the
    # CPU fallback announced below actually works.
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        vram_gb = gpu.total_memory / (1024 ** 3)
        device = 0
        print(f"  GPU: {gpu.name} ({vram_gb:.1f} GB)")
    else:
        vram_gb = 0
        device = "cpu"
        print("  WARNING: No GPU — training will be very slow")

    # ── Pick batch size based on VRAM ──
    batch = _select_batch_size(vram_gb)

    # ── Dataset path (use absolute path to avoid any issues) ──
    script_dir = Path(__file__).resolve().parent
    data_yaml = script_dir / "dataset" / "data.yaml"
    if not data_yaml.exists():
        print(f"  ERROR: Dataset not found: {data_yaml}")
        sys.exit(1)

    # Count images (informational only; training reads paths from data.yaml)
    n_train = _count_images(script_dir / "dataset" / "train" / "images")
    n_valid = _count_images(script_dir / "dataset" / "valid" / "images")
    print(f"  Dataset: {n_train} train / {n_valid} val images")
    print(f"  Batch size: {batch}")
    print("  Image size: 640 (native 600x600 — no downscaling)")
    print(f"  Epochs: {epochs}")
    print()

    # ── Load model ──
    # YOLO11s: 9.4M params, 21.5 GFLOPs — 3.6× more capacity than 11n
    # Fits in 3.7 GB VRAM at batch=2 with AMP (~2.2 GB)
    model_name = "yolo11s.pt"
    print(f"  Base model: {model_name}")
    model = YOLO(model_name)

    # ══════════════════════════════════════════════════════════════════
    #  TRAINING — optimised hyperparameters
    # ══════════════════════════════════════════════════════════════════
    try:
        # The return value (validation metrics) is not needed here; the
        # output directory is read from model.trainer afterwards.
        model.train(
            # ── Data ──
            data=str(data_yaml),
            imgsz=640,                  # Match native 600×600 (padded to 640)

            # ── Training schedule ──
            epochs=epochs,
            patience=0 if is_dry_run else 50,  # No early stop in dry run
            batch=batch,                # Fit in 4 GB VRAM

            # ── Optimiser ──
            optimizer="AdamW",
            lr0=0.002,                  # Slightly higher LR for small dataset
            lrf=0.01,                   # Final LR = lr0 × lrf (cosine decay)
            momentum=0.937,
            weight_decay=0.0005,
            warmup_epochs=10,           # Longer warmup for stability
            warmup_momentum=0.5,
            warmup_bias_lr=0.01,

            # ── Augmentation (aggressive for small dataset) ──
            hsv_h=0.02,                 # Hue shift
            hsv_s=0.75,                 # Saturation shift
            hsv_v=0.5,                  # Value/brightness shift
            degrees=15.0,               # Rotation ±15°
            translate=0.2,              # Translation ±20%
            scale=0.5,                  # Scale ±50%
            shear=5.0,                  # Shear ±5°
            perspective=0.0001,         # Slight perspective warp
            flipud=0.1,                 # Vertical flip (road can be upside-down in data)
            fliplr=0.5,                 # Horizontal flip
            mosaic=1.0,                 # Full mosaic — critical for small datasets
            mixup=0.15,                 # Mix images to reduce overfitting
            copy_paste=0.1,             # Copy-paste augmentation
            erasing=0.2,                # Random erasing (dropout-like)
            close_mosaic=20,            # Disable mosaic last 20 epochs for fine-tuning

            # ── Performance ──
            device=device,              # GPU 0, or "cpu" when CUDA is unavailable
            workers=4,                  # 12 threads but limited RAM
            cache="disk",               # Don't eat RAM (only 15 GB)
            amp=True,                   # Mixed precision — saves VRAM

            # ── Saving ──
            project="road_anomaly",
            name=run_name,
            exist_ok=True,
            save=True,
            save_period=25,             # Checkpoint every 25 epochs
            val=True,
            plots=True,

            # ── Advanced ──
            cos_lr=True,                # Cosine learning rate schedule
            nbs=64,                     # Nominal batch size for LR scaling
        )
    except Exception as e:
        # Top-level boundary: log with traceback (console and log file), then
        # exit non-zero.
        logger.exception("Training failed: %s", e)
        sys.exit(1)

    # ══════════════════════════════════════════════════════════════════
    #  Post-training — copy weights & validate
    # ══════════════════════════════════════════════════════════════════

    # Get save_dir from the trainer (NOT from the train() result — that is metrics)
    save_dir = Path(model.trainer.save_dir)
    logger.info("Training save dir: %s", save_dir)

    best_src, last_src = _locate_weights(save_dir, script_dir)

    if not best_src.exists():
        logger.error("FATAL: best.pt not found anywhere after training!")
        logger.error("Check these directories manually:")
        logger.error("  %s", save_dir)
        for p in Path(".").rglob("best.pt"):
            logger.error("  Found: %s", p)
        sys.exit(1)

    # Copy to standard locations
    dest_dir = script_dir / "runs"
    dest_dir.mkdir(parents=True, exist_ok=True)

    dest_best = dest_dir / "best.pt"
    _export_checkpoint(best_src, dest_best, "Best")

    # Also copy to project root for convenience
    _export_checkpoint(best_src, script_dir / "best.pt", "Best")

    if last_src.exists():
        _export_checkpoint(last_src, dest_dir / "last.pt", "Last")

    # ── Final validation ──
    print()
    print("=" * 60)
    print("  FINAL VALIDATION")
    print("=" * 60)

    try:
        best_model = YOLO(str(dest_best))
        metrics = best_model.val(data=str(data_yaml), imgsz=640, device=device)

        p = metrics.box.mp
        r = metrics.box.mr
        f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0

        print(f"  mAP@0.5:       {metrics.box.map50*100:.1f}%")
        print(f"  mAP@0.5:0.95:  {metrics.box.map*100:.1f}%")
        print(f"  Precision:      {p*100:.1f}%")
        print(f"  Recall:         {r*100:.1f}%")
        print(f"  F1-score:       {f1*100:.1f}%")
        print(f"  Inference:      {metrics.speed['inference']:.1f} ms/image")
        print()

        # Per-class: ap50 only has entries for classes that occur in the
        # validation set, so pair each score with its class id from
        # ap_class_index rather than with its position in the array.
        print("  Per-class mAP@0.5:")
        for class_id, ap in zip(metrics.box.ap_class_index, metrics.box.ap50):
            print(f"    {best_model.names[int(class_id)]:>20s}: {ap*100:.1f}%")
        print("=" * 60)
    except Exception as e:
        # Deliberately best-effort: the weights are already exported, so a
        # validation failure is reported but does not abort the script.
        logger.exception("Validation failed: %s", e)
        print("  Validation failed but model was saved successfully.")
        print(f"  Model at: {dest_best}")

    print()
    print(f"  Finished: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"  Model saved to: {dest_best}")
    print("  Run 'python evaluate.py' to re-check anytime.")
    print()


# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()