"""M1 model-evaluation harness: candidate VLMs × images × styles → markdown report.

Designed to run on a CUDA box (DGX Spark) so results transfer to ZeroGPU:

    uv sync --extra local
    uv run python -m small_cuts.eval --images ~/eval-photos --out eval-report.md

Smoke test anywhere (no weights):

    uv run python -m small_cuts.eval --images ~/eval-photos --backend mock
"""

from __future__ import annotations

import argparse
import tempfile
import time
from pathlib import Path

from PIL import Image

from .narrator import MockBackend, Narration, TransformersBackend, narrate

# Model ids evaluated by default (the --models CLI default). Each one is passed
# to TransformersBackend as ``model_id`` when the transformers backend is used.
CANDIDATE_MODELS = [
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "google/gemma-3-4b-it",
]

# Style keys evaluated by default (the --styles CLI default). Each one is passed
# to narrate() as ``style_key``.
EVAL_STYLES = ["deadpan", "noir", "nature_doc"]

# .heic/.heif (iPhone default) decode via pillow-heif, registered in small_cuts/__init__.py
# Matched against the lowercased file suffix, so entries must be lowercase and
# include the leading dot.
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".heic", ".heif"}

# Real Small Cuts input is video (Ray-Ban / phone clips). When a directory holds
# videos, we sample frames so the model eval runs on representative stills.
# Matched the same way as IMAGE_SUFFIXES (lowercased suffix, leading dot).
VIDEO_SUFFIXES = {".mov", ".mp4", ".m4v", ".webm", ".avi", ".mkv"}

# Scoring instructions emitted verbatim near the top of the report. The S/G/V
# letters correspond to the empty S, G and V columns of each per-image table.
RUBRIC = (
    "Score each cell 1-5 on: **S**pecificity (names real visible things), "
    "**G**roundedness (no invented objects/people), **V**oice (style lands). "
    "A model needs S>=4 and G>=4 on most images to be the pick."
)


def _sample_video_frames(
    video: Path,
    every_n_seconds: float = 3.0,
    output_dir: Path | None = None,
) -> list[Path]:
    """Sample frames from ``video``, save each as a ``.jpg`` file, return the paths.

    Args:
        video: Path of the video to sample.
        every_n_seconds: Sampling interval, forwarded unchanged to
            ``frames.sample_frames``.
        output_dir: Directory that receives the frame files. When omitted, a
            fresh temporary directory (prefix ``small-cuts-eval-frames-``) is
            created; nothing in this function removes it afterwards.

    Returns:
        Paths of the written frame files, in the order the frames were yielded.
    """
    from .frames import sample_frames

    # Sample first so a failing video does not leave an empty temp dir behind.
    frames = sample_frames(video, every_n_seconds=every_n_seconds)

    if output_dir:
        dest = output_dir
    else:
        dest = Path(tempfile.mkdtemp(prefix="small-cuts-eval-frames-"))
    dest.mkdir(parents=True, exist_ok=True)

    def _write(index, frame) -> Path:
        # NOTE(review): names are derived from the stem only, so two videos
        # sharing a stem (e.g. clip.mov and clip.mp4) map to the same files.
        target = dest / f"{video.stem}_frame{index:06d}.jpg"
        frame.save(target)
        return target

    return [_write(index, frame) for index, frame in enumerate(frames)]


def load_images(images_dir: Path, frame_dir: Path | None = None) -> list[Path]:
    if not images_dir.exists():
        raise SystemExit(f"Directory does not exist: {images_dir}")
    entries = sorted(p for p in images_dir.iterdir() if p.is_file())
    paths = [p for p in entries if p.suffix.lower() in IMAGE_SUFFIXES]
    videos = [p for p in entries if p.suffix.lower() in VIDEO_SUFFIXES]
    for video in videos:
        print(f"Sampling frames from {video.name}")
        paths.extend(_sample_video_frames(video, output_dir=frame_dir))
    if not paths:
        listing = "\n".join(f"  {p.name}" for p in entries) or "  (directory is empty)"
        raise SystemExit(
            f"No images or videos found in {images_dir}.\n"
            f"Directory contains:\n{listing}\n"
            f"Recognized image suffixes: {sorted(IMAGE_SUFFIXES)}\n"
            f"Recognized video suffixes: {sorted(VIDEO_SUFFIXES)}"
        )
    return sorted(paths)


def run_model(
    model_id: str, image_paths: list[Path], styles: list[str], backend_name: str
) -> dict[tuple[str, str], Narration]:
    """Narrate every image in every style with one model.

    Args:
        model_id: Model identifier; passed to ``TransformersBackend`` and used
            in the progress output. Ignored by the mock backend.
        image_paths: Image files to narrate.
        styles: Style keys, each passed to ``narrate`` as ``style_key``.
        backend_name: ``"mock"`` selects ``MockBackend``; any other value
            selects ``TransformersBackend``.

    Returns:
        Mapping of ``(image file name, style)`` to the resulting ``Narration``.
        Keys use ``path.name`` only, so images in different directories that
        share a file name overwrite each other.

    Any exception from backend construction, image decoding or narration
    propagates to the caller; results gathered so far are not returned.
    """
    backend = MockBackend() if backend_name == "mock" else TransformersBackend(model_id=model_id)
    results: dict[tuple[str, str], Narration] = {}
    for path in image_paths:
        # Image.open() is lazy and keeps the file open. convert() decodes into
        # a new in-memory image, so the source handle can be released right
        # away instead of lingering until garbage collection.
        with Image.open(path) as source:
            image = source.convert("RGB")
        for style in styles:
            result = narrate(image, style_key=style, backend=backend)
            results[(path.name, style)] = result
            print(f"  {model_id} | {path.name} | {style} | {result.latency_s:.1f}s")
    return results


def render_report(
    all_results: dict[str, dict[tuple[str, str], Narration]],
    image_paths: list[Path],
    styles: list[str],
) -> str:
    """Build the markdown report: a header, then one scoring table per image.

    Args:
        all_results: Per-model results, each keyed by ``(image file name, style)``.
        image_paths: Images to report on; one ``##`` section each, in this order.
        styles: Styles to list for every model within each section.

    Returns:
        The report as a single newline-joined string. Combinations missing
        from a model's results get a ``(failed)`` row; the S/G/V cells are
        left blank for manual scoring.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    out = [
        "# Small Cuts — M1 Narrator Model Eval",
        "",
        f"Generated {timestamp}.",
        "",
        RUBRIC,
        "",
    ]

    for image_path in image_paths:
        image_name = image_path.name
        out += [
            f"## {image_name}",
            "",
            "| Model | Style | Narration | Latency | S | G | V |",
            "|---|---|---|---|---|---|---|",
        ]
        for model_id, model_results in all_results.items():
            for style in styles:
                cell = model_results.get((image_name, style))
                if cell is not None:
                    # Keep the narration on one table row: flatten newlines
                    # and escape the markdown column separator.
                    flat = cell.text.replace("\n", " ").replace("|", "\\|")
                    row = f"| {model_id} | {style} | {flat} | {cell.latency_s:.1f}s |  |  |  |"
                else:
                    row = f"| {model_id} | {style} | (failed) | - |  |  |  |"
                out.append(row)
        # Blank line terminates the table before the next section.
        out.append("")

    return "\n".join(out)


def main(argv: list[str] | None = None) -> None:
    """CLI entry point: evaluate the models and write the markdown report.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Raises:
        SystemExit: On argument errors, when no usable images are found, or
            when every model fails.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--images", type=Path, required=True, help="Directory of eval photos")
    parser.add_argument("--models", nargs="*", default=CANDIDATE_MODELS)
    parser.add_argument("--styles", nargs="*", default=EVAL_STYLES)
    parser.add_argument("--out", type=Path, default=Path("eval-report.md"))
    parser.add_argument("--backend", choices=["transformers", "mock"], default="transformers")
    args = parser.parse_args(argv)

    # Extracted video frames live in a temp dir that must outlive every model
    # run, so all of the evaluation happens inside this context.
    with tempfile.TemporaryDirectory(prefix="small-cuts-eval-frames-") as frame_dir:
        image_paths = load_images(args.images, frame_dir=Path(frame_dir))
        # The mock backend ignores model ids, so run it once under one label.
        models = args.models if args.backend == "transformers" else ["mock"]
        all_results: dict[str, dict[tuple[str, str], Narration]] = {}
        failures: list[str] = []
        for model_id in models:
            try:
                all_results[model_id] = run_model(model_id, image_paths, args.styles, args.backend)
            except Exception as exc:  # one gated/broken model must not kill the eval
                failures.append(f"{model_id}: {type(exc).__name__}: {exc}")
                print(f"  FAILED {model_id}: {exc}")
        if not all_results:
            raise SystemExit("All models failed:\n" + "\n".join(failures))
        report = render_report(all_results, image_paths, args.styles)
        if failures:
            report += "\n## Failed models\n\n" + "\n".join(f"- {f}" for f in failures) + "\n"
        # The report contains non-ASCII text (the title's em dash, model
        # output); an explicit encoding keeps the write independent of the
        # platform's locale default.
        args.out.write_text(report, encoding="utf-8")
    print(f"\nReport written to {args.out}")


# Run the CLI when executed as a script or via ``python -m small_cuts.eval``.
if __name__ == "__main__":
    main()