Image Feature Extraction
MLX
English
data-label-factory
vision
dataset-labeling
object-detection
apple-silicon
gemma
falcon-perception
openrouter
yolo
Instructions to use waltgrace/data-label-factory with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use waltgrace/data-label-factory with MLX:
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir data-label-factory waltgrace/data-label-factory
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
| """ | |
| benchmark.py — v1 vs v2 comparison for data_label_factory. | |
| Run the same images through multiple labeling backends, compute deterministic | |
| metrics on each, and generate a comparison report. | |
| Usage: | |
| # Compare two existing experiment COCO files | |
| data_label_factory benchmark --compare \ | |
| experiments/2026-04-13_falcon/label_falcon/drones.coco.json \ | |
| experiments/2026-04-13_wilddet3d/label_wilddet3d/drones.coco.json | |
| # Run a fresh benchmark: label the same images with multiple backends | |
| data_label_factory benchmark --run \ | |
| --project projects/drones.yaml \ | |
| --backends falcon,wilddet3d \ | |
| --limit 50 | |
| # Score a single experiment | |
| data_label_factory benchmark --score experiments/latest/ | |
| # MODEL BENCHMARK — compare VLMs for filter/verify accuracy | |
| # Run the same filter prompt through multiple OpenRouter models + local Qwen | |
| data_label_factory benchmark --models \ | |
| --project projects/drones.yaml \ | |
| --model-list "google/gemma-4-26b-a4b-it,meta-llama/llama-4-scout,qwen" \ | |
| --limit 30 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from collections import defaultdict | |
| from datetime import datetime | |
| from pathlib import Path | |
| from .metrics import ( | |
| score_coco, score_experiment, match_annotations, | |
| ComparisonReport, ExperimentScore, verify_bbox_rules, | |
| ) | |
| def _print_score(name: str, score: ExperimentScore): | |
| """Pretty-print an experiment score.""" | |
| print(f"\n {name}") | |
| print(f" images: {score.total_images}") | |
| print(f" annotations: {score.total_annotations}") | |
| print(f" pass rate: {score.pass_rate:.1%}") | |
| print(f" mean score: {score.mean_score:.3f}") | |
| print(f" mean area%: {score.mean_area_ratio:.3f}") | |
| if score.rule_breakdown: | |
| print(f" rules:") | |
| for rule, rate in sorted(score.rule_breakdown.items()): | |
| flag = "ok" if rate >= 0.95 else "WARN" if rate >= 0.80 else "FAIL" | |
| print(f" {rule:20s} {rate:6.1%} {flag}") | |
| if score.per_category: | |
| print(f" categories:") | |
| for cat, cnt in sorted(score.per_category.items(), key=lambda x: -x[1]): | |
| print(f" {cat:30s} {cnt:5d}") | |
| def _print_comparison(name_a: str, name_b: str, report: ComparisonReport): | |
| """Pretty-print a comparison report.""" | |
| print(f"\n {name_a} vs {name_b}") | |
| print(f" matched pairs: {len(report.matched)}") | |
| print(f" only in {name_a}: {len(report.unmatched_a)}") | |
| print(f" only in {name_b}: {len(report.unmatched_b)}") | |
| print(f" precision: {report.precision:.3f}") | |
| print(f" recall: {report.recall:.3f}") | |
| print(f" F1: {report.f1:.3f}") | |
| print(f" mean IoU: {report.mean_iou:.3f}") | |
| print(f" category agree: {report.category_agreement:.1%}") | |
| if report.per_category: | |
| print(f" per-category:") | |
| for cat, stats in sorted(report.per_category.items()): | |
| print(f" {cat:30s} matched={stats['matched']} " | |
| f"only_a={stats['only_a']} only_b={stats['only_b']}") | |
def _annotations_by_file(coco: dict, shared: set) -> dict:
    """Group a COCO dict's annotations by image file name, limited to *shared*.

    Each kept annotation is shallow-copied and given a ``"category"`` key with
    the category name resolved through ``category_id``. When the id is not in
    the COCO ``categories`` list, the annotation's own ``"category"`` value
    (or ``""``) is used instead.
    """
    id_to_fname = {img["id"]: img["file_name"] for img in coco.get("images", [])}
    cat_names = {c["id"]: c["name"] for c in coco.get("categories", [])}
    grouped = defaultdict(list)
    for ann in coco.get("annotations", []):
        fname = id_to_fname.get(ann["image_id"], "")
        if fname not in shared:
            continue
        entry = dict(ann)
        entry["category"] = cat_names.get(ann.get("category_id"), ann.get("category", ""))
        grouped[fname].append(entry)
    return grouped


def _score_summary(score) -> dict:
    """Return the JSON-serialisable subset of an experiment score used in reports."""
    return {
        "total_images": score.total_images,
        "total_annotations": score.total_annotations,
        "pass_rate": round(score.pass_rate, 4),
        "mean_score": round(score.mean_score, 4),
        "rule_breakdown": {k: round(v, 4) for k, v in score.rule_breakdown.items()},
        "per_category": score.per_category,
    }


def cmd_benchmark_compare(args):
    """Compare two COCO files.

    Reads ``args.a`` and ``args.b`` (paths to COCO JSON files) and
    ``args.output`` (report path; defaults to ``benchmark_report.json`` when
    falsy). Each file is scored independently with ``score_coco``. Annotations
    on images whose ``file_name`` appears in both files are then matched per
    image with ``match_annotations`` and aggregated into one
    ``ComparisonReport``. The scores, and the comparison when there are shared
    images, are printed and written to the report path as JSON.
    """
    with open(args.a, encoding="utf-8") as f:
        coco_a = json.load(f)
    with open(args.b, encoding="utf-8") as f:
        coco_b = json.load(f)

    # The experiment sub-directory name doubles as the display name.
    name_a = os.path.basename(os.path.dirname(args.a))
    name_b = os.path.basename(os.path.dirname(args.b))

    print("=" * 70)
    print(f"BENCHMARK: {name_a} vs {name_b}")
    print("=" * 70)

    # Score each independently
    score_a = score_coco(coco_a)
    score_b = score_coco(coco_b)
    _print_score(f"[A] {name_a}", score_a)
    _print_score(f"[B] {name_b}", score_b)

    # Cross-compare on shared images (images are paired by file name, since
    # numeric image ids are local to each COCO file).
    files_a = {img["file_name"] for img in coco_a.get("images", [])}
    files_b = {img["file_name"] for img in coco_b.get("images", [])}
    shared = files_a & files_b

    overall = None
    if shared:
        anns_a_by_img = _annotations_by_file(coco_a, shared)
        anns_b_by_img = _annotations_by_file(coco_b, shared)

        # Aggregate comparison across all shared images. Sorting keeps the
        # order of the aggregated lists reproducible between runs.
        all_matched = []
        all_unmatched_a = []
        all_unmatched_b = []
        for fname in sorted(shared):
            per_image = match_annotations(anns_a_by_img[fname], anns_b_by_img[fname])
            all_matched.extend(per_image.matched)
            all_unmatched_a.extend(per_image.unmatched_a)
            all_unmatched_b.extend(per_image.unmatched_b)

        per_cat = defaultdict(lambda: {"matched": 0, "only_a": 0, "only_b": 0})
        for m in all_matched:
            per_cat[m.ann_a.get("category", "?")]["matched"] += 1
        for a in all_unmatched_a:
            per_cat[a.get("category", "?")]["only_a"] += 1
        for b in all_unmatched_b:
            per_cat[b.get("category", "?")]["only_b"] += 1

        overall = ComparisonReport(
            matched=all_matched,
            unmatched_a=all_unmatched_a,
            unmatched_b=all_unmatched_b,
            per_category=dict(per_cat),
        )
        print(f"\n Shared images: {len(shared)}")
        _print_comparison(name_a, name_b, overall)
    else:
        print("\n No shared images between the two COCO files.")

    # Save report
    report_path = args.output or "benchmark_report.json"
    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "file_a": args.a,
        "file_b": args.b,
        "score_a": _score_summary(score_a),
        "score_b": _score_summary(score_b),
    }
    if overall is not None:
        report["comparison"] = {
            "shared_images": len(shared),
            "matched": len(overall.matched),
            "only_a": len(overall.unmatched_a),
            "only_b": len(overall.unmatched_b),
            "precision": round(overall.precision, 4),
            "recall": round(overall.recall, 4),
            "f1": round(overall.f1, 4),
            "mean_iou": round(overall.mean_iou, 4),
            "category_agreement": round(overall.category_agreement, 4),
            "per_category": overall.per_category,
        }
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\n Report saved: {report_path}")
def cmd_benchmark_score(args):
    """Print the scores for a single experiment directory.

    ``args.score`` is the directory to score. If it is not a directory, a
    message is printed and the process exits with status 1. Otherwise every
    entry returned by ``score_experiment`` is printed with ``_print_score``,
    or a notice is shown when nothing was returned.
    """
    target = args.score
    if not os.path.isdir(target):
        print(f"Not a directory: {target}")
        sys.exit(1)

    rule = "=" * 70
    print(rule)
    print(f"SCORING: {target}")
    print(rule)

    results = score_experiment(target)
    if results:
        for label, result in results.items():
            _print_score(label, result)
    else:
        print(" No COCO files found.")
def cmd_benchmark_run(args):
    """Run a fresh benchmark: label same images with multiple backends.

    Reads ``args.project`` (project YAML path), ``args.backends``
    (comma-separated backend names) and ``args.limit`` (maximum number of
    images; values <= 0 mean no limit).

    Images are collected from the project's local image directory; only files
    inside a sub-directory are used, and files directly in the root are
    skipped. Every backend that can be created and reports itself alive
    labels the same image list. Its detections are written as a COCO JSON
    file under ``label_<backend>/`` in a new experiment directory. Each COCO
    file is then scored, and when two or more backends produced output every
    pair is compared with ``cmd_benchmark_compare``.

    Exits with status 1 when the image directory does not exist.
    """
    from .project import load_project
    from .providers import create_provider
    from .experiments import make_experiment_dir, write_config, update_latest_symlink

    proj = load_project(args.project)
    # Ignore blank entries produced by stray commas ("falcon,").
    backends = [b.strip() for b in args.backends.split(",") if b.strip()]
    img_root = proj.local_image_dir()
    if not os.path.exists(img_root):
        print(f"No images at {img_root}; run gather first.")
        sys.exit(1)

    # Collect images
    images = []
    for root, _, names in os.walk(img_root):
        for n in names:
            if not n.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
                continue
            full = os.path.join(root, n)
            rel = os.path.relpath(full, img_root)
            # Skip files directly in the root. relpath uses the platform
            # separator, so test for os.sep rather than a literal "/".
            if os.sep not in rel:
                continue
            images.append((rel, full))
    # os.walk order depends on the filesystem; sort so that --limit selects
    # the same subset on every run.
    images.sort()
    if args.limit > 0:
        images = images[:args.limit]

    queries = proj.falcon_queries

    print("=" * 70)
    print(f"BENCHMARK RUN: {proj.project_name}")
    print(f" images: {len(images)}")
    print(f" backends: {backends}")
    print(f" queries: {queries}")
    print("=" * 70)

    exp = make_experiment_dir(f"benchmark-{proj.project_name}")
    write_config(exp, {
        "type": "benchmark",
        "project": proj.project_name,
        "backends": backends,
        "n_images": len(images),
        "queries": queries,
    })
    update_latest_symlink(exp)

    coco_files = {}
    for backend_name in backends:
        print(f"\n>>> Backend: {backend_name}")
        try:
            provider = create_provider(backend_name)
        except Exception as e:
            print(f" SKIP: {e}")
            continue
        status = provider.status()
        if not status.get("alive"):
            print(f" SKIP: {backend_name} not alive — {status.get('info', '')}")
            continue

        # Build COCO
        from PIL import Image as PILImage
        coco = {
            "info": {
                "description": f"benchmark {proj.project_name} via {backend_name}",
                "date_created": datetime.now().isoformat(timespec="seconds"),
            },
            "images": [],
            "annotations": [],
            "categories": [
                {"id": i + 1, "name": q, "supercategory": "object"}
                for i, q in enumerate(queries)
            ],
        }
        cat_id = {q: i + 1 for i, q in enumerate(queries)}
        next_img_id, next_ann_id = 1, 1
        t0 = time.time()
        for i, (rel, full) in enumerate(images, 1):
            try:
                # Context manager closes the file handle once the size is read.
                with PILImage.open(full) as im:
                    iw, ih = im.size
            except Exception as e:
                # Best effort: an unreadable image is reported and skipped.
                print(f" SKIP image {rel}: {e}")
                continue
            img_id = next_img_id
            next_img_id += 1
            coco["images"].append({
                "id": img_id, "file_name": rel, "width": iw, "height": ih
            })
            result = provider.label_image(full, queries, image_wh=(iw, ih))
            for ann in result.annotations:
                # Look up the fallback lazily so an empty query list only
                # matters when an annotation really has no category.
                cat_name = ann["category"] if "category" in ann else queries[0]
                cid = cat_id.get(cat_name)
                if cid is None:
                    # Add dynamic category
                    cid = len(coco["categories"]) + 1
                    coco["categories"].append({"id": cid, "name": cat_name, "supercategory": "object"})
                    cat_id[cat_name] = cid
                bbox = ann["bbox"]
                coco["annotations"].append({
                    "id": next_ann_id,
                    "image_id": img_id,
                    "category_id": cid,
                    "bbox": bbox,
                    # Area is width * height of the box (bbox[2] * bbox[3]).
                    "area": round(bbox[2] * bbox[3], 2),
                    "iscrowd": 0,
                    "score": ann.get("score", 1.0),
                })
                next_ann_id += 1
            if i % 10 == 0 or i == len(images):
                elapsed = time.time() - t0
                rate = i / max(elapsed, 1)
                eta = (len(images) - i) / max(rate, 0.001) / 60
                n_ann = len(coco["annotations"])
                print(f" [{i:4d}/{len(images)}] anns={n_ann} ETA {eta:.1f} min")

        # Save COCO
        out_dir = os.path.join(exp, f"label_{backend_name}")
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, f"{proj.project_name}.coco.json")
        with open(out_path, "w") as f:
            json.dump(coco, f, indent=2)
        coco_files[backend_name] = out_path
        print(f" Saved: {out_path} ({len(coco['annotations'])} annotations)")

    # Score and compare
    if coco_files:
        print("\n" + "=" * 70)
        print("RESULTS")
        print("=" * 70)
        scores = {}
        for name, path in coco_files.items():
            with open(path) as f:
                coco = json.load(f)
            scores[name] = score_coco(coco)
            _print_score(name, scores[name])
    if len(coco_files) >= 2:
        names = list(coco_files)
        print(f"\n Cross-comparison:")
        # Every unordered pair of backends is compared exactly once.
        for idx, first in enumerate(names):
            for second in names[idx + 1:]:
                args_cmp = argparse.Namespace(
                    a=coco_files[first],
                    b=coco_files[second],
                    output=os.path.join(exp, f"compare_{first}_vs_{second}.json"),
                )
                cmd_benchmark_compare(args_cmp)
    print(f"\n Benchmark experiment: {exp}")
def cmd_benchmark_models(args):
    """MODEL BENCHMARK — compare VLMs for filter/verify accuracy.

    Runs the same filter prompt through multiple models (local Qwen, OpenRouter
    models, etc.) on the same images and compares YES/NO agreement rates.
    This answers: "which VLM is best at filtering images for my dataset?"

    Reads ``args.project`` (project YAML path), ``args.model_list``
    (comma-separated model specs) and ``args.limit`` (maximum number of
    images; values <= 0 mean no limit). The specs ``"qwen"`` and ``"gemma"``
    are created as providers of that name; every other spec is passed to the
    ``"openrouter"`` provider as its ``model``.

    Per-model verdicts are written to ``filter_<model>/keep_list.json`` in a
    new experiment directory. When at least two models ran, a summary with
    per-model statistics and pairwise agreement is written to
    ``model_comparison.json``.

    Exits with status 1 when the image directory does not exist.
    """
    from .project import load_project
    from .providers import create_provider
    from .experiments import make_experiment_dir, write_config, update_latest_symlink

    proj = load_project(args.project)
    # Ignore blank entries produced by stray commas.
    model_list = [m.strip() for m in args.model_list.split(",") if m.strip()]
    img_root = proj.local_image_dir()
    if not os.path.exists(img_root):
        print(f"No images at {img_root}; run gather first.")
        sys.exit(1)

    images = []
    for root, _, names in os.walk(img_root):
        for n in names:
            if not n.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
                continue
            full = os.path.join(root, n)
            rel = os.path.relpath(full, img_root)
            # Skip files directly in the root. relpath uses the platform
            # separator, so test for os.sep rather than a literal "/".
            if os.sep not in rel:
                continue
            images.append((rel, full))
    # os.walk order depends on the filesystem; sort so that --limit selects
    # the same subset on every run.
    images.sort()
    if args.limit > 0:
        images = images[:args.limit]

    prompt = proj.prompt("filter")

    print("=" * 70)
    print(f"MODEL BENCHMARK: {proj.project_name}")
    print(f" images: {len(images)}")
    print(f" models: {model_list}")
    print(f" prompt: {prompt[:80]}...")
    print("=" * 70)

    exp = make_experiment_dir(f"model-bench-{proj.project_name}")
    write_config(exp, {
        "type": "model_benchmark",
        "project": proj.project_name,
        "models": model_list,
        "n_images": len(images),
        "prompt": prompt,
    })
    update_latest_symlink(exp)

    # For each model, run filter on all images
    all_results: dict[str, list[dict]] = {}
    for model_spec in model_list:
        # Determine provider: "qwen", "gemma" are local; anything else is OpenRouter
        model_id = model_spec
        is_local = model_spec in ("qwen", "gemma")
        provider_name = model_spec if is_local else "openrouter"
        try:
            if is_local:
                provider = create_provider(provider_name)
            else:
                provider = create_provider("openrouter", config={"model": model_id})
        except Exception as e:
            print(f"\n SKIP {model_spec}: {e}")
            continue

        status = provider.status()
        if not status.get("alive"):
            print(f"\n SKIP {model_spec}: not alive — {status.get('info', '')}")
            continue

        print(f"\n>>> {model_id}")
        results = []
        counts = {"YES": 0, "NO": 0, "UNKNOWN": 0, "ERROR": 0}
        t0 = time.time()
        for i, (rel, full) in enumerate(images, 1):
            try:
                fr = provider.filter_image(full, prompt)
                verdict = fr.verdict
                raw = fr.raw_answer
                elapsed_img = fr.elapsed
            except Exception as e:
                # A failing image is recorded as ERROR so that every model's
                # result list stays aligned with the image list.
                verdict, raw, elapsed_img = "ERROR", str(e)[:80], 0
            # Guard against a provider returning no answer text at all.
            raw = "" if raw is None else str(raw)
            counts[verdict] = counts.get(verdict, 0) + 1
            results.append({
                "image": rel,
                "verdict": verdict,
                "raw_answer": raw[:120],
                "elapsed": round(elapsed_img, 3),
            })
            if i % 10 == 0 or i == len(images):
                elapsed_total = time.time() - t0
                rate = i / max(elapsed_total, 1)
                eta = (len(images) - i) / max(rate, 0.001) / 60
                print(f" [{i:4d}/{len(images)}] YES={counts['YES']} NO={counts['NO']} "
                      f"ERR={counts.get('ERROR',0)} {rate:.1f} img/s ETA {eta:.1f}m")

        elapsed_total = time.time() - t0
        all_results[model_id] = results

        # Save per-model results
        out_dir = os.path.join(exp, f"filter_{model_id.replace('/', '_')}")
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, "keep_list.json"), "w") as f:
            json.dump({
                "model": model_id,
                "provider": provider_name,
                "project": proj.project_name,
                "counts": counts,
                "elapsed_total": round(elapsed_total, 1),
                "results": results,
            }, f, indent=2)

    # ── Comparison Report ──
    if len(all_results) < 2:
        print("\n Need at least 2 models to compare.")
        print(f" Experiment: {exp}")
        return

    print("\n" + "=" * 70)
    print("MODEL COMPARISON")
    print("=" * 70)

    models = list(all_results)
    # Every model saw the same image list in the same order, so the first
    # model's results provide the image names for all of them.
    image_names = [r["image"] for r in all_results[models[0]]]

    # Per-model stats
    print(f"\n {'Model':40s} {'YES':>5s} {'NO':>5s} {'UNK':>5s} {'ERR':>5s} {'YES%':>6s} {'avg_s':>6s}")
    print(" " + "-" * 72)
    model_stats = {}
    for model_id in models:
        results = all_results[model_id]
        yes = sum(1 for r in results if r["verdict"] == "YES")
        no = sum(1 for r in results if r["verdict"] == "NO")
        unk = sum(1 for r in results if r["verdict"] == "UNKNOWN")
        err = sum(1 for r in results if r["verdict"] == "ERROR")
        avg_s = sum(r["elapsed"] for r in results) / max(len(results), 1)
        yes_pct = yes / max(len(results), 1)
        model_stats[model_id] = {"yes": yes, "no": no, "unk": unk, "err": err,
                                 "yes_pct": yes_pct, "avg_s": avg_s}
        short = model_id[-38:]
        print(f" {short:40s} {yes:5d} {no:5d} {unk:5d} {err:5d} {yes_pct:5.0%} {avg_s:6.2f}")

    # Pairwise agreement
    print(f"\n Pairwise agreement:")
    pairwise = []
    for i, m1 in enumerate(models):
        for m2 in models[i + 1:]:
            r1 = all_results[m1]
            r2 = all_results[m2]
            agree = sum(1 for a, b in zip(r1, r2) if a["verdict"] == b["verdict"])
            total = min(len(r1), len(r2))
            pct = agree / max(total, 1)
            print(f" {m1[-20:]} vs {m2[-20:]}: {agree}/{total} ({pct:.0%})")
            pairwise.append({
                "model_a": m1,
                "model_b": m2,
                "agree": agree,
                "total": total,
                "agreement": round(pct, 4),
            })

    # Disagreement examples: the first 10 images on which the models gave
    # more than one distinct verdict (ERROR / UNKNOWN are not counted).
    print(f"\n Top disagreements:")
    n_shown = 0
    for idx, img in enumerate(image_names):
        if n_shown >= 10:
            break
        verdicts = {m: all_results[m][idx]["verdict"] for m in models}
        unique = set(verdicts.values()) - {"ERROR", "UNKNOWN"}
        if len(unique) > 1:
            verdict_str = " ".join(f"{m[-15:]}={v}" for m, v in verdicts.items())
            print(f" {img[:50]:50s} {verdict_str}")
            n_shown += 1

    # Save comparison report
    report = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "project": proj.project_name,
        "n_images": len(images),
        "prompt": prompt,
        "models": model_stats,
        "pairwise_agreement": pairwise,
    }
    report_path = os.path.join(exp, "model_comparison.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
    print(f"\n Report: {report_path}")
    print(f" Experiment: {exp}")
def main(argv: list[str] | None = None):
    """Parse the ``benchmark`` command line and dispatch to the chosen mode.

    Exactly one of ``--compare``, ``--score``, ``--run`` or ``--models`` must
    be given. ``--run`` and ``--models`` additionally require ``--project``;
    without it the parser reports an error and exits.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        prog="data_label_factory benchmark",
        description="Compare labeling backends with deterministic metrics.",
    )

    # Mode selectors: one and only one is accepted.
    mode_group = parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        "--compare",
        nargs=2,
        metavar=("COCO_A", "COCO_B"),
        help="Compare two existing COCO files",
    )
    mode_group.add_argument(
        "--score",
        metavar="EXP_DIR",
        help="Score a single experiment directory",
    )
    mode_group.add_argument(
        "--run",
        action="store_true",
        help="Run a fresh benchmark with multiple backends",
    )
    mode_group.add_argument(
        "--models",
        action="store_true",
        help="Model benchmark: compare VLMs for filter/verify accuracy",
    )

    # Options shared between modes.
    parser.add_argument("--project", help="Project YAML (for --run / --models)")
    parser.add_argument(
        "--backends",
        default="falcon",
        help="Comma-separated backends to benchmark (for --run)",
    )
    parser.add_argument(
        "--model-list",
        default="qwen,google/gemma-4-26b-a4b-it",
        help="Comma-separated model IDs for --models. "
             "Use 'qwen'/'gemma' for local, or OpenRouter model IDs "
             "(e.g. google/gemma-4-26b-a4b-it, meta-llama/llama-4-scout)",
    )
    parser.add_argument("--limit", type=int, default=0, help="Max images")
    parser.add_argument("--output", help="Output report path (for --compare)")

    args = parser.parse_args(argv)

    # Resolve the selected mode in the same priority order as the flags above.
    if args.compare:
        mode = "compare"
    elif args.score:
        mode = "score"
    elif args.run:
        mode = "run"
    elif args.models:
        mode = "models"
    else:
        # Only reachable with a falsy selector value (e.g. --score "").
        return

    if mode in ("run", "models") and not args.project:
        parser.error(f"--project is required with --{mode}")

    if mode == "compare":
        # cmd_benchmark_compare reads the two paths from args.a / args.b.
        args.a, args.b = args.compare
        cmd_benchmark_compare(args)
    elif mode == "score":
        cmd_benchmark_score(args)
    elif mode == "run":
        cmd_benchmark_run(args)
    else:
        cmd_benchmark_models(args)