"""Find best and worst predictions by per-sample IoU and generate showcase figures."""

import json
from pathlib import Path

import matplotlib

matplotlib.use("Agg")  # headless backend: must be set before pyplot import
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from tqdm import tqdm

PROJECT_ROOT = Path(__file__).resolve().parents[1]


def iou(pred: np.ndarray, gt: np.ndarray) -> float:
    """Return the intersection-over-union of two binary masks.

    Args:
        pred: Binary prediction mask (nonzero = positive).
        gt: Binary ground-truth mask, same shape as ``pred``.

    Returns:
        IoU in [0, 1]; 0.0 when both masks are empty (union == 0).
    """
    intersection = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return float(intersection / union) if union > 0 else 0.0


def score_all():
    """Score every test prediction against ground truth.

    For each test sample, every candidate mask (one per prompt) found in
    ``outputs/masks`` is compared to the ground truth and only the
    best-scoring candidate is recorded.

    Returns:
        Dict mapping dataset name ("taping" / "cracks") to a list of score
        records: image/mask/pred paths, the winning prompt, IoU, and dataset.
    """
    with open(PROJECT_ROOT / "data" / "splits" / "test.json") as f:
        test_samples = json.load(f)

    masks_dir = PROJECT_ROOT / "outputs" / "masks"
    scores = {"taping": [], "cracks": []}

    for sample in tqdm(test_samples, desc="Scoring predictions"):
        img_stem = Path(sample["image_path"]).stem
        ds = sample["dataset"]
        # Predictions are saved as "<image-stem>__<prompt_with_underscores>.png".
        candidates = list(masks_dir.glob(f"{img_stem}__*.png"))
        if not candidates:
            continue  # no prediction produced for this image

        gt = np.array(Image.open(sample["mask_path"]).convert("L"))
        gt_bin = (gt > 127).astype(np.uint8)

        best_iou = -1.0  # float sentinel; any real score (>= 0) beats it
        best_pred_path = None
        best_prompt = None
        for pred_path in candidates:
            # Resize prediction to GT resolution; NEAREST keeps the mask binary.
            pred = np.array(
                Image.open(pred_path).convert("L").resize(
                    (gt.shape[1], gt.shape[0]), Image.NEAREST
                )
            )
            pred_bin = (pred > 127).astype(np.uint8)
            score = iou(pred_bin, gt_bin)
            if score > best_iou:
                best_iou = score
                best_pred_path = pred_path
                # Recover the human-readable prompt from the filename suffix.
                best_prompt = pred_path.stem.split("__")[1].replace("_", " ")

        scores[ds].append({
            "image_path": sample["image_path"],
            "mask_path": sample["mask_path"],
            "pred_path": str(best_pred_path),
            "prompt": best_prompt,
            "iou": best_iou,
            "dataset": ds,
        })
    return scores


def pick_ranked(scores, n_per_class=3, best=True):
    """Pick top-N (``best=True``) or bottom-N (``best=False``) per class by IoU.

    Args:
        scores: Per-class score lists as returned by ``score_all``.
        n_per_class: Number of samples to keep per dataset.
        best: Select highest IoU when True, lowest when False.

    Returns:
        Flat list of selected score records (cracks first, then taping).
    """
    result = []
    for ds in ["cracks", "taping"]:
        # For the "worst" view, drop zero-IoU entries (no overlap at all,
        # typically missing predictions) so only genuine failures are shown.
        pool = [s for s in scores[ds] if s["iou"] > 0] if not best else scores[ds]
        ranked = sorted(pool, key=lambda x: x["iou"], reverse=best)
        selected = ranked[:n_per_class]
        result.extend(selected)
        label = "best" if best else "worst"
        print(f"\n{ds} {label} {n_per_class}:")
        for r in selected:
            print(f" IoU={r['iou']:.4f} {Path(r['image_path']).name} \"{r['prompt']}\"")
    return result


def generate_grid(examples, output_path, title=""):
    """Generate an original | ground truth | prediction comparison grid.

    Args:
        examples: Non-empty list of score records (see ``pick_ranked``).
        output_path: Destination PNG path; parent directories are created.
        title: Optional figure-level title.

    Raises:
        ValueError: If ``examples`` is empty (``plt.subplots(0, 3)`` would
            otherwise fail with an opaque error).
    """
    if not examples:
        raise ValueError("generate_grid requires at least one example")
    output_path = Path(output_path)
    # savefig raises FileNotFoundError if the directory does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    n = len(examples)
    fig, axes = plt.subplots(n, 3, figsize=(14, 4.0 * n))
    if n == 1:
        axes = [axes]  # normalize so axes[i][j] indexing works for a single row
    if title:
        fig.suptitle(title, fontsize=16, fontweight="bold", y=0.998)

    for i, ex in enumerate(examples):
        img = Image.open(ex["image_path"]).convert("RGB")
        gt = Image.open(ex["mask_path"]).convert("L")
        # Match prediction to GT size; NEAREST avoids interpolated gray values.
        pred = Image.open(ex["pred_path"]).convert("L").resize(
            (gt.size[0], gt.size[1]), Image.NEAREST
        )
        label = ex["dataset"].capitalize()

        axes[i][0].imshow(img)
        axes[i][0].set_title(f"Input — {label}", fontsize=11, fontweight="bold")
        axes[i][0].axis("off")

        axes[i][1].imshow(gt, cmap="gray", vmin=0, vmax=255)
        axes[i][1].set_title("Ground Truth", fontsize=11)
        axes[i][1].axis("off")

        axes[i][2].imshow(pred, cmap="gray", vmin=0, vmax=255)
        axes[i][2].set_title(
            f"Predicted — \"{ex['prompt']}\" (IoU {ex['iou']:.2f})", fontsize=11
        )
        axes[i][2].axis("off")

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white")
    plt.close(fig)  # close this figure explicitly, not whatever is "current"
    print(f"Saved → {output_path}")


if __name__ == "__main__":
    figures_dir = PROJECT_ROOT / "reports" / "figures"

    scores = score_all()

    # Best predictions (3 per class)
    best = pick_ranked(scores, n_per_class=3, best=True)
    generate_grid(best, figures_dir / "best_predictions.png",
                  title="Best Test-Set Predictions (by IoU)")

    # Worst predictions (3 per class) — only samples where the model actually
    # predicted something (zero-IoU entries are filtered inside pick_ranked)
    worst = pick_ranked(scores, n_per_class=3, best=False)
    generate_grid(worst, figures_dir / "failure_cases.png",
                  title="Failure Cases — Worst Test-Set Predictions (by IoU)")