"""M1 model-evaluation harness: candidate VLMs × images × styles → markdown report. Designed to run on a CUDA box (DGX Spark) so results transfer to ZeroGPU: uv sync --extra local uv run python -m small_cuts.eval --images ~/eval-photos --out eval-report.md Smoke test anywhere (no weights): uv run python -m small_cuts.eval --images ~/eval-photos --backend mock """ from __future__ import annotations import argparse import tempfile import time from pathlib import Path from PIL import Image from .narrator import MockBackend, Narration, TransformersBackend, narrate CANDIDATE_MODELS = [ "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2.5-VL-7B-Instruct", "google/gemma-3-4b-it", ] EVAL_STYLES = ["deadpan", "noir", "nature_doc"] # .heic/.heif (iPhone default) decode via pillow-heif, registered in small_cuts/__init__.py IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".heic", ".heif"} # Real Small Cuts input is video (Ray-Ban / phone clips). When a directory holds # videos, we sample frames so the model eval runs on representative stills. VIDEO_SUFFIXES = {".mov", ".mp4", ".m4v", ".webm", ".avi", ".mkv"} RUBRIC = ( "Score each cell 1-5 on: **S**pecificity (names real visible things), " "**G**roundedness (no invented objects/people), **V**oice (style lands). " "A model needs S>=4 and G>=4 on most images to be the pick." ) def _sample_video_frames( video: Path, every_n_seconds: float = 3.0, output_dir: Path | None = None, ) -> list[Path]: """Extract frames from a video into an output directory; return their paths.""" from .frames import sample_frames images = sample_frames(video, every_n_seconds=every_n_seconds) output = output_dir or Path(tempfile.mkdtemp(prefix="small-cuts-eval-frames-")) output.mkdir(parents=True, exist_ok=True) out_paths: list[Path] = [] for i, img in enumerate(images): out = output / f"{video.stem}_frame{i:06d}.jpg" img.save(out) out_paths.append(out) return out_paths def load_images(images_dir: Path, frame_dir: Path | None = None) -> list[Path]: if not images_dir.exists(): raise SystemExit(f"Directory does not exist: {images_dir}") entries = sorted(p for p in images_dir.iterdir() if p.is_file()) paths = [p for p in entries if p.suffix.lower() in IMAGE_SUFFIXES] videos = [p for p in entries if p.suffix.lower() in VIDEO_SUFFIXES] for video in videos: print(f"Sampling frames from {video.name}") paths.extend(_sample_video_frames(video, output_dir=frame_dir)) if not paths: listing = "\n".join(f" {p.name}" for p in entries) or " (directory is empty)" raise SystemExit( f"No images or videos found in {images_dir}.\n" f"Directory contains:\n{listing}\n" f"Recognized image suffixes: {sorted(IMAGE_SUFFIXES)}\n" f"Recognized video suffixes: {sorted(VIDEO_SUFFIXES)}" ) return sorted(paths) def run_model( model_id: str, image_paths: list[Path], styles: list[str], backend_name: str ) -> dict[tuple[str, str], Narration]: backend = MockBackend() if backend_name == "mock" else TransformersBackend(model_id=model_id) results: dict[tuple[str, str], Narration] = {} for path in image_paths: image = Image.open(path).convert("RGB") for style in styles: result = narrate(image, style_key=style, backend=backend) results[(path.name, style)] = result print(f" {model_id} | {path.name} | {style} | {result.latency_s:.1f}s") return results def render_report( all_results: dict[str, dict[tuple[str, str], Narration]], image_paths: list[Path], styles: list[str], ) -> str: lines = [ "# Small Cuts — M1 Narrator Model Eval", "", f"Generated {time.strftime('%Y-%m-%d %H:%M:%S')}.", "", RUBRIC, "", ] for path in image_paths: lines.append(f"## {path.name}") lines.append("") lines.append("| Model | Style | Narration | Latency | S | G | V |") lines.append("|---|---|---|---|---|---|---|") for model_id, results in all_results.items(): for style in styles: narration = results.get((path.name, style)) if narration is None: lines.append(f"| {model_id} | {style} | (failed) | - | | | |") continue text = narration.text.replace("\n", " ").replace("|", "\\|") lines.append( f"| {model_id} | {style} | {text} | {narration.latency_s:.1f}s | | | |" ) lines.append("") return "\n".join(lines) def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--images", type=Path, required=True, help="Directory of eval photos") parser.add_argument("--models", nargs="*", default=CANDIDATE_MODELS) parser.add_argument("--styles", nargs="*", default=EVAL_STYLES) parser.add_argument("--out", type=Path, default=Path("eval-report.md")) parser.add_argument("--backend", choices=["transformers", "mock"], default="transformers") args = parser.parse_args(argv) with tempfile.TemporaryDirectory(prefix="small-cuts-eval-frames-") as frame_dir: image_paths = load_images(args.images, frame_dir=Path(frame_dir)) models = args.models if args.backend == "transformers" else ["mock"] all_results = {} failures = [] for model_id in models: try: all_results[model_id] = run_model(model_id, image_paths, args.styles, args.backend) except Exception as exc: # one gated/broken model must not kill the eval failures.append(f"{model_id}: {type(exc).__name__}: {exc}") print(f" FAILED {model_id}: {exc}") if not all_results: raise SystemExit("All models failed:\n" + "\n".join(failures)) report = render_report(all_results, image_paths, args.styles) if failures: report += "\n## Failed models\n\n" + "\n".join(f"- {f}" for f in failures) + "\n" args.out.write_text(report) print(f"\nReport written to {args.out}") if __name__ == "__main__": main()