Buckets:

evalstate
/

birch-html

Files

xet

evalstate/birch-html / analysis /data /artifact-summary.json

evalstate

4 days ago

download

raw

204 kB

	[
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 41655,
	"generation_ok": true,
	"generation_duration_s": 233.57,
	"input_tokens": 257043,
	"output_tokens": 19565,
	"total_tokens": 276608,
	"billing_tokens": 276608,
	"reasoning_tokens": 13843,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 236032,
	"total_cache_tokens": 236032,
	"effective_input_tokens": 21011,
	"display_input_tokens": 257043,
	"usage_event_count": 12,
	"tool_calls": 16,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica",
	"deterministic_failures": 0,
	"deterministic_warnings": 2,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 40247,
	"generation_ok": true,
	"generation_duration_s": 251.091,
	"input_tokens": 1602209,
	"output_tokens": 16541,
	"total_tokens": 1618750,
	"billing_tokens": 1618750,
	"reasoning_tokens": 10735,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1516544,
	"total_cache_tokens": 1516544,
	"effective_input_tokens": 85665,
	"display_input_tokens": 1602209,
	"usage_event_count": 24,
	"tool_calls": 39,
	"turn_count": 24,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 24,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help \| sed -n '1,220p' \| checker CLI usage error \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 51503,
	"generation_ok": true,
	"generation_duration_s": 228.357,
	"input_tokens": 538144,
	"output_tokens": 20613,
	"total_tokens": 558757,
	"billing_tokens": 558757,
	"reasoning_tokens": 12973,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 489472,
	"total_cache_tokens": 489472,
	"effective_input_tokens": 48672,
	"display_input_tokens": 538144,
	"usage_event_count": 14,
	"tool_calls": 29,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: rg -n \"^def (contract_findings\|compare_stats\|screenshot_findings\|artifact_screenshot_findings\|geometry_findings\|render_markdown\|capture\|find_chrome\|capture_height_for_viewport\|css_ \| ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 48838,
	"generation_ok": true,
	"generation_duration_s": 249.193,
	"input_tokens": 122451,
	"output_tokens": 13529,
	"total_tokens": 135980,
	"billing_tokens": 135980,
	"reasoning_tokens": 8129,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 103936,
	"total_cache_tokens": 103936,
	"effective_input_tokens": 18515,
	"display_input_tokens": 122451,
	"usage_event_count": 8,
	"tool_calls": 11,
	"turn_count": 8,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 8,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 55271,
	"generation_ok": true,
	"generation_duration_s": 193.592,
	"input_tokens": 280048,
	"output_tokens": 17564,
	"total_tokens": 297612,
	"billing_tokens": 297612,
	"reasoning_tokens": 9912,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 261120,
	"total_cache_tokens": 261120,
	"effective_input_tokens": 18928,
	"display_input_tokens": 280048,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 4,
	"self_check_failed_runs": 3,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres \| ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h \| ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 41967,
	"generation_ok": true,
	"generation_duration_s": 118.283,
	"input_tokens": 95354,
	"output_tokens": 5337,
	"total_tokens": 100691,
	"billing_tokens": 100691,
	"reasoning_tokens": 402,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 78848,
	"total_cache_tokens": 78848,
	"effective_input_tokens": 16506,
	"display_input_tokens": 95354,
	"usage_event_count": 10,
	"tool_calls": 10,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 1,
	"self_corrected_after_checker": true,
	"self_correction_verified": true,
	"assistant_turns_trace": 10,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 44204,
	"generation_ok": true,
	"generation_duration_s": 164.43,
	"input_tokens": 388756,
	"output_tokens": 7268,
	"total_tokens": 396024,
	"billing_tokens": 396024,
	"reasoning_tokens": 2335,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 346624,
	"total_cache_tokens": 346624,
	"effective_input_tokens": 42132,
	"display_input_tokens": 388756,
	"usage_event_count": 16,
	"tool_calls": 22,
	"turn_count": 16,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 16,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin \| checker CLI usage error \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 57189,
	"generation_ok": true,
	"generation_duration_s": 178.972,
	"input_tokens": 450726,
	"output_tokens": 9063,
	"total_tokens": 459789,
	"billing_tokens": 459789,
	"reasoning_tokens": 477,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 400896,
	"total_cache_tokens": 400896,
	"effective_input_tokens": 49830,
	"display_input_tokens": 450726,
	"usage_event_count": 14,
	"tool_calls": 25,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <met \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/mod \| ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html')\ns=p.read",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 1,
	"vlm_warnings": 1,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 91,
	"task_score": 18.2,
	"task_score_max": 20,
	"quality_score": 91,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 49708,
	"generation_ok": true,
	"generation_duration_s": 144.313,
	"input_tokens": 129170,
	"output_tokens": 6893,
	"total_tokens": 136063,
	"billing_tokens": 136063,
	"reasoning_tokens": 369,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 91136,
	"total_cache_tokens": 91136,
	"effective_input_tokens": 38034,
	"display_input_tokens": 129170,
	"usage_event_count": 11,
	"tool_calls": 13,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexre \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin \| ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/implementation-plan.html')",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 53609,
	"generation_ok": true,
	"generation_duration_s": 142.604,
	"input_tokens": 126650,
	"output_tokens": 6524,
	"total_tokens": 133174,
	"billing_tokens": 133174,
	"reasoning_tokens": 491,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 101376,
	"total_cache_tokens": 101376,
	"effective_input_tokens": 25274,
	"display_input_tokens": 126650,
	"usage_event_count": 11,
	"tool_calls": 13,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/benchmark-comparison.html' \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 17281,
	"generation_ok": true,
	"generation_duration_s": 82.34,
	"input_tokens": 825347,
	"output_tokens": 23923,
	"total_tokens": 849270,
	"billing_tokens": 849270,
	"reasoning_tokens": 13374,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 770688,
	"total_cache_tokens": 770688,
	"effective_input_tokens": 54659,
	"display_input_tokens": 825347,
	"usage_event_count": 32,
	"tool_calls": 31,
	"turn_count": 32,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 32,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 6,
	"deterministic_warnings": 2,
	"vlm_failures": 1,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 1,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 1,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexspark/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 9658,
	"generation_ok": false,
	"generation_duration_s": 60.395,
	"input_tokens": 1737615,
	"output_tokens": 21291,
	"total_tokens": 1758906,
	"billing_tokens": 1758906,
	"reasoning_tokens": 17081,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1702656,
	"total_cache_tokens": 1702656,
	"effective_input_tokens": 86941,
	"display_input_tokens": 1789597,
	"usage_event_count": 41,
	"tool_calls": 32,
	"turn_count": 26,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 41,
	"self_check_mode": "checker-shell-reference,read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '1,260p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '260,560p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '560,920p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '920,1320p'",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 16366,
	"generation_ok": false,
	"generation_duration_s": 87.747,
	"input_tokens": 2740590,
	"output_tokens": 27049,
	"total_tokens": 2767639,
	"billing_tokens": 2767639,
	"reasoning_tokens": 15704,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 2024320,
	"total_cache_tokens": 2024320,
	"effective_input_tokens": 202803,
	"display_input_tokens": 2227123,
	"usage_event_count": 35,
	"tool_calls": 51,
	"turn_count": 42,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 35,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node\|flow-edge\|flow-list\|flow-step\|metric-row\|chart-panel\|finding\|code-block\|copyable\|timeline)\" styles/birch-system.css \| shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py \| shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain \| ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char",
	"deterministic_failures": 0,
	"deterministic_warnings": 4,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 0,
	"mobile_warnings": 1,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 1,
	"artifact_present": true,
	"artifact_score_100": 91,
	"task_score": 18.2,
	"task_score_max": 20,
	"quality_score": 91,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 46864,
	"generation_ok": true,
	"generation_duration_s": 91.953,
	"input_tokens": 1108319,
	"output_tokens": 14746,
	"total_tokens": 1123065,
	"billing_tokens": 1123065,
	"reasoning_tokens": 8043,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1055232,
	"total_cache_tokens": 1055232,
	"effective_input_tokens": 53087,
	"display_input_tokens": 1108319,
	"usage_event_count": 35,
	"tool_calls": 37,
	"turn_count": 35,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 35,
	"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help \| head -n 120 \| checker CLI usage error \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- \| ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 55786,
	"generation_ok": true,
	"generation_duration_s": 41.038,
	"input_tokens": 681289,
	"output_tokens": 5651,
	"total_tokens": 686940,
	"billing_tokens": 686940,
	"reasoning_tokens": 4100,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 628224,
	"total_cache_tokens": 628224,
	"effective_input_tokens": 53065,
	"display_input_tokens": 681289,
	"usage_event_count": 24,
	"tool_calls": 23,
	"turn_count": 24,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 24,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 62489,
	"generation_ok": true,
	"generation_duration_s": 280.24,
	"input_tokens": 594128,
	"output_tokens": 18097,
	"total_tokens": 612225,
	"billing_tokens": 612225,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 560512,
	"total_cache_tokens": 560512,
	"effective_input_tokens": 33616,
	"display_input_tokens": 594128,
	"usage_event_count": 18,
	"tool_calls": 20,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 18,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/deepseek/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 62789,
	"generation_ok": true,
	"generation_duration_s": 294.1,
	"input_tokens": 784186,
	"output_tokens": 14634,
	"total_tokens": 798820,
	"billing_tokens": 798820,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 749440,
	"total_cache_tokens": 749440,
	"effective_input_tokens": 34746,
	"display_input_tokens": 784186,
	"usage_event_count": 26,
	"tool_calls": 30,
	"turn_count": 26,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 26,
	"self_check_mode": "checker-shell-reference,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review \| shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py \| grep -A5 \"add_argument\" \| shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\\|--artifact\" skill/scripts/check_birch_renderings.py \| head -10 \| ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r \| shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py \| head -3 \| ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 31473,
	"generation_ok": false,
	"generation_duration_s": 177.334,
	"input_tokens": 215656,
	"output_tokens": 9938,
	"total_tokens": 225594,
	"billing_tokens": 225594,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 449920,
	"total_cache_tokens": 449920,
	"effective_input_tokens": 48511,
	"display_input_tokens": 498431,
	"usage_event_count": 10,
	"tool_calls": 10,
	"turn_count": 6,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 10,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 8,
	"deterministic_warnings": 1,
	"vlm_failures": 7,
	"vlm_warnings": 0,
	"deterministic_failure_units": 3,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 2,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 1,
	"mobile_failures": 3,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 3,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 52099,
	"generation_ok": true,
	"generation_duration_s": 112.544,
	"input_tokens": 173739,
	"output_tokens": 6911,
	"total_tokens": 180650,
	"billing_tokens": 180650,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 160128,
	"total_cache_tokens": 160128,
	"effective_input_tokens": 13611,
	"display_input_tokens": 173739,
	"usage_event_count": 12,
	"tool_calls": 15,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 78962,
	"generation_ok": true,
	"generation_duration_s": 378.136,
	"input_tokens": 767427,
	"output_tokens": 27984,
	"total_tokens": 795411,
	"billing_tokens": 795411,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 717696,
	"total_cache_tokens": 717696,
	"effective_input_tokens": 49731,
	"display_input_tokens": 767427,
	"usage_event_count": 18,
	"tool_calls": 22,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "checker-shell-reference",
	"self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 53215,
	"generation_ok": true,
	"generation_duration_s": 114.216,
	"input_tokens": 1371616,
	"output_tokens": 5260,
	"total_tokens": 1376876,
	"billing_tokens": 1376876,
	"reasoning_tokens": 12418,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1116684,
	"total_cache_tokens": 1116684,
	"effective_input_tokens": 254932,
	"display_input_tokens": 1371616,
	"usage_event_count": 29,
	"tool_calls": 28,
	"turn_count": 29,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 29,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html \| ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 53047,
	"generation_ok": true,
	"generation_duration_s": 193.238,
	"input_tokens": 1684136,
	"output_tokens": 6902,
	"total_tokens": 1691038,
	"billing_tokens": 1691038,
	"reasoning_tokens": 23273,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1424691,
	"total_cache_tokens": 1424691,
	"effective_input_tokens": 259445,
	"display_input_tokens": 1684136,
	"usage_event_count": 34,
	"tool_calls": 33,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 34,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help \| checker CLI usage error \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 57420,
	"generation_ok": true,
	"generation_duration_s": 203.178,
	"input_tokens": 2196880,
	"output_tokens": 10222,
	"total_tokens": 2207102,
	"billing_tokens": 2207102,
	"reasoning_tokens": 22501,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1965131,
	"total_cache_tokens": 1965131,
	"effective_input_tokens": 231749,
	"display_input_tokens": 2196880,
	"usage_event_count": 33,
	"tool_calls": 32,
	"turn_count": 33,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 2,
	"self_check_failed_runs": 2,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 33,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read scripts/check_birch_renderings.py \| ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 49628,
	"generation_ok": true,
	"generation_duration_s": 201.715,
	"input_tokens": 2346900,
	"output_tokens": 9173,
	"total_tokens": 2356073,
	"billing_tokens": 2356073,
	"reasoning_tokens": 15150,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 2043078,
	"total_cache_tokens": 2043078,
	"effective_input_tokens": 303822,
	"display_input_tokens": 2346900,
	"usage_event_count": 34,
	"tool_calls": 33,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 5,
	"self_check_failed_runs": 4,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 34,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help \| checker CLI usage error \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 97390,
	"generation_ok": true,
	"generation_duration_s": 62.077,
	"input_tokens": 495825,
	"output_tokens": 829,
	"total_tokens": 496654,
	"billing_tokens": 496654,
	"reasoning_tokens": 4961,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 387138,
	"total_cache_tokens": 387138,
	"effective_input_tokens": 108687,
	"display_input_tokens": 495825,
	"usage_event_count": 17,
	"tool_calls": 16,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 17,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 62971,
	"generation_ok": true,
	"generation_duration_s": 300.114,
	"input_tokens": 459899,
	"output_tokens": 16275,
	"total_tokens": 476174,
	"billing_tokens": 476174,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 369152,
	"total_cache_tokens": 369152,
	"effective_input_tokens": 90747,
	"display_input_tokens": 459899,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/glm51/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 48933,
	"generation_ok": true,
	"generation_duration_s": 133.324,
	"input_tokens": 254816,
	"output_tokens": 8008,
	"total_tokens": 262824,
	"billing_tokens": 262824,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 202560,
	"total_cache_tokens": 202560,
	"effective_input_tokens": 52256,
	"display_input_tokens": 254816,
	"usage_event_count": 11,
	"tool_calls": 13,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 2,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 92,
	"task_score": 18.4,
	"task_score_max": 20,
	"quality_score": 92,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 54229,
	"generation_ok": true,
	"generation_duration_s": 94.822,
	"input_tokens": 358438,
	"output_tokens": 6652,
	"total_tokens": 365090,
	"billing_tokens": 365090,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 254656,
	"total_cache_tokens": 254656,
	"effective_input_tokens": 103782,
	"display_input_tokens": 358438,
	"usage_event_count": 9,
	"tool_calls": 15,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 60535,
	"generation_ok": true,
	"generation_duration_s": 90.03,
	"input_tokens": 210191,
	"output_tokens": 7574,
	"total_tokens": 217765,
	"billing_tokens": 217765,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 180736,
	"total_cache_tokens": 180736,
	"effective_input_tokens": 29455,
	"display_input_tokens": 210191,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation",
	"deterministic_failures": 2,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 64863,
	"generation_ok": true,
	"generation_duration_s": 149.159,
	"input_tokens": 274201,
	"output_tokens": 14416,
	"total_tokens": 288617,
	"billing_tokens": 288617,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 214336,
	"total_cache_tokens": 214336,
	"effective_input_tokens": 59865,
	"display_input_tokens": 274201,
	"usage_event_count": 12,
	"tool_calls": 14,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm52",
	"model_slug": "glm52",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm52-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/glm52/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 51395,
	"generation_ok": true,
	"generation_duration_s": 274.73,
	"input_tokens": 271862,
	"output_tokens": 16133,
	"total_tokens": 287995,
	"billing_tokens": 287995,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 239936,
	"total_cache_tokens": 239936,
	"effective_input_tokens": 31926,
	"display_input_tokens": 271862,
	"usage_event_count": 14,
	"tool_calls": 17,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html && echo \"=== finished ===\" && uv \| ran checker CLI: cd /home/shaun/source/birch-html && F=eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html\necho \"=== my local style block (after system block) ===\"\npython3 - \"$F\" <",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm52",
	"model_slug": "glm52",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm52-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/glm52/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 50845,
	"generation_ok": true,
	"generation_duration_s": 771.097,
	"input_tokens": 1249523,
	"output_tokens": 43260,
	"total_tokens": 1292783,
	"billing_tokens": 1292783,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1177792,
	"total_cache_tokens": 1177792,
	"effective_input_tokens": 71731,
	"display_input_tokens": 1249523,
	"usage_event_count": 25,
	"tool_calls": 32,
	"turn_count": 25,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 25,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/code-review.ht \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/code-r \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --help 2>&1 \| rg -i 'viewport\|artifact\|mobile\|width' \| head; echo \"=== run m \| checker CLI usage error",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm52",
	"model_slug": "glm52",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm52-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/glm52/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 59952,
	"generation_ok": true,
	"generation_duration_s": 751.331,
	"input_tokens": 1204327,
	"output_tokens": 40435,
	"total_tokens": 1244762,
	"billing_tokens": 1244762,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1118592,
	"total_cache_tokens": 1118592,
	"effective_input_tokens": 85735,
	"display_input_tokens": 1204327,
	"usage_event_count": 22,
	"tool_calls": 32,
	"turn_count": 22,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 22,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/module-explainer.htm",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm52",
	"model_slug": "glm52",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm52-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/glm52/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 56320,
	"generation_ok": true,
	"generation_duration_s": 456.209,
	"input_tokens": 991570,
	"output_tokens": 24123,
	"total_tokens": 1015693,
	"billing_tokens": 1015693,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 911168,
	"total_cache_tokens": 911168,
	"effective_input_tokens": 80402,
	"display_input_tokens": 991570,
	"usage_event_count": 18,
	"tool_calls": 26,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/implementation \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-glm52-pu",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm52",
	"model_slug": "glm52",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm52-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/glm52/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 60487,
	"generation_ok": true,
	"generation_duration_s": 380.184,
	"input_tokens": 522022,
	"output_tokens": 23534,
	"total_tokens": 545556,
	"billing_tokens": 545556,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 459648,
	"total_cache_tokens": 459648,
	"effective_input_tokens": 62374,
	"display_input_tokens": 522022,
	"usage_event_count": 16,
	"tool_calls": 19,
	"turn_count": 16,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 16,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/benchmark-comp \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/benchm",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 40305,
	"generation_ok": true,
	"generation_duration_s": 63.372,
	"input_tokens": 91503,
	"output_tokens": 5097,
	"total_tokens": 96600,
	"billing_tokens": 96600,
	"reasoning_tokens": 1083,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 76800,
	"total_cache_tokens": 76800,
	"effective_input_tokens": 14703,
	"display_input_tokens": 91503,
	"usage_event_count": 8,
	"tool_calls": 11,
	"turn_count": 8,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 2,
	"deterministic_warnings": 2,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 39494,
	"generation_ok": true,
	"generation_duration_s": 94.334,
	"input_tokens": 461816,
	"output_tokens": 6027,
	"total_tokens": 467843,
	"billing_tokens": 467843,
	"reasoning_tokens": 2855,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 384640,
	"total_cache_tokens": 384640,
	"effective_input_tokens": 77176,
	"display_input_tokens": 461816,
	"usage_event_count": 17,
	"tool_calls": 18,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 17,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 46290,
	"generation_ok": true,
	"generation_duration_s": 93.641,
	"input_tokens": 555669,
	"output_tokens": 7177,
	"total_tokens": 562846,
	"billing_tokens": 562846,
	"reasoning_tokens": 1701,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 450304,
	"total_cache_tokens": 450304,
	"effective_input_tokens": 105365,
	"display_input_tokens": 555669,
	"usage_event_count": 17,
	"tool_calls": 23,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 17,
	"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu \| ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h \| checker CLI usage error",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 1,
	"vlm_warnings": 1,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 91,
	"task_score": 18.2,
	"task_score_max": 20,
	"quality_score": 91,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 45485,
	"generation_ok": true,
	"generation_duration_s": 59.362,
	"input_tokens": 90659,
	"output_tokens": 4766,
	"total_tokens": 95425,
	"billing_tokens": 95425,
	"reasoning_tokens": 589,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 71168,
	"total_cache_tokens": 71168,
	"effective_input_tokens": 19491,
	"display_input_tokens": 90659,
	"usage_event_count": 9,
	"tool_calls": 10,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 46793,
	"generation_ok": true,
	"generation_duration_s": 61.812,
	"input_tokens": 60483,
	"output_tokens": 5615,
	"total_tokens": 66098,
	"billing_tokens": 66098,
	"reasoning_tokens": 746,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 53376,
	"total_cache_tokens": 53376,
	"effective_input_tokens": 7107,
	"display_input_tokens": 60483,
	"usage_event_count": 7,
	"tool_calls": 8,
	"turn_count": 7,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 88,
	"task_score": 17.6,
	"task_score_max": 20,
	"quality_score": 88,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 36903,
	"generation_ok": true,
	"generation_duration_s": 49.028,
	"input_tokens": 73338,
	"output_tokens": 3307,
	"total_tokens": 76645,
	"billing_tokens": 76645,
	"reasoning_tokens": 925,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 62720,
	"total_cache_tokens": 62720,
	"effective_input_tokens": 10618,
	"display_input_tokens": 73338,
	"usage_event_count": 10,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 38297,
	"generation_ok": true,
	"generation_duration_s": 55.392,
	"input_tokens": 190492,
	"output_tokens": 4553,
	"total_tokens": 195045,
	"billing_tokens": 195045,
	"reasoning_tokens": 2340,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 147520,
	"total_cache_tokens": 147520,
	"effective_input_tokens": 42972,
	"display_input_tokens": 190492,
	"usage_event_count": 11,
	"tool_calls": 10,
	"turn_count": 11,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 9279,
	"generation_ok": false,
	"generation_duration_s": 40.052,
	"input_tokens": 125766,
	"output_tokens": 3826,
	"total_tokens": 129592,
	"billing_tokens": 129592,
	"reasoning_tokens": 1202,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 46784,
	"total_cache_tokens": 46784,
	"effective_input_tokens": 53433,
	"display_input_tokens": 100217,
	"usage_event_count": 15,
	"tool_calls": 6,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 3,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 2,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 16152,
	"generation_ok": false,
	"generation_duration_s": 41.596,
	"input_tokens": 32235,
	"output_tokens": 5236,
	"total_tokens": 37471,
	"billing_tokens": 37471,
	"reasoning_tokens": 1207,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 39488,
	"total_cache_tokens": 39488,
	"effective_input_tokens": 20479,
	"display_input_tokens": 59967,
	"usage_event_count": 8,
	"tool_calls": 4,
	"turn_count": 5,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 10364,
	"generation_ok": false,
	"generation_duration_s": 98.19,
	"input_tokens": 153411,
	"output_tokens": 7388,
	"total_tokens": 160799,
	"billing_tokens": 160799,
	"reasoning_tokens": 2517,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 39488,
	"total_cache_tokens": 39488,
	"effective_input_tokens": 6645,
	"display_input_tokens": 46133,
	"usage_event_count": 8,
	"tool_calls": 15,
	"turn_count": 16,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 1,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 23937,
	"generation_ok": false,
	"generation_duration_s": 67.62,
	"input_tokens": 119520,
	"output_tokens": 7707,
	"total_tokens": 127227,
	"billing_tokens": 127227,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 7297,
	"cache_write_tokens": 12081,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 19378,
	"effective_input_tokens": 11280,
	"display_input_tokens": 30658,
	"usage_event_count": 4,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 4,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data",
	"deterministic_failures": 16,
	"deterministic_warnings": 12,
	"vlm_failures": 1,
	"vlm_warnings": 0,
	"deterministic_failure_units": 4,
	"deterministic_warning_units": 3,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 4,
	"desktop_warnings": 3,
	"mobile_failures": 4,
	"mobile_warnings": 3,
	"deep_failures": 4,
	"deep_warnings": 3,
	"mobile_deep_failures": 4,
	"mobile_deep_warnings": 3,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/haiku45/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 53526,
	"generation_ok": true,
	"generation_duration_s": 94.461,
	"input_tokens": 301467,
	"output_tokens": 10117,
	"total_tokens": 311584,
	"billing_tokens": 311584,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 228528,
	"cache_write_tokens": 34499,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 263027,
	"effective_input_tokens": 38440,
	"display_input_tokens": 301467,
	"usage_event_count": 11,
	"tool_calls": 11,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
	"deterministic_failures": 6,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 87,
	"task_score": 17.4,
	"task_score_max": 20,
	"quality_score": 87,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 57853,
	"generation_ok": false,
	"generation_duration_s": 75.42,
	"input_tokens": 211164,
	"output_tokens": 9407,
	"total_tokens": 220571,
	"billing_tokens": 220571,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 55031,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 55031,
	"effective_input_tokens": 80985,
	"display_input_tokens": 136016,
	"usage_event_count": 3,
	"tool_calls": 10,
	"turn_count": 6,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 3,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 50641,
	"generation_ok": true,
	"generation_duration_s": 67.418,
	"input_tokens": 123711,
	"output_tokens": 7166,
	"total_tokens": 130877,
	"billing_tokens": 130877,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 91600,
	"cache_write_tokens": 16126,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 107726,
	"effective_input_tokens": 15985,
	"display_input_tokens": 123711,
	"usage_event_count": 9,
	"tool_calls": 9,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 49137,
	"generation_ok": true,
	"generation_duration_s": 65.28,
	"input_tokens": 151349,
	"output_tokens": 7796,
	"total_tokens": 159145,
	"billing_tokens": 159145,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 122743,
	"cache_write_tokens": 12640,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 135383,
	"effective_input_tokens": 15966,
	"display_input_tokens": 151349,
	"usage_event_count": 11,
	"tool_calls": 10,
	"turn_count": 11,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 3,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 67620,
	"generation_ok": true,
	"generation_duration_s": 194.344,
	"input_tokens": 470039,
	"output_tokens": 5317,
	"total_tokens": 475356,
	"billing_tokens": 475356,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 425472,
	"total_cache_tokens": 425472,
	"effective_input_tokens": 44567,
	"display_input_tokens": 470039,
	"usage_event_count": 20,
	"tool_calls": 23,
	"turn_count": 20,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 20,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/kimi/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 44300,
	"generation_ok": true,
	"generation_duration_s": 627.536,
	"input_tokens": 1248543,
	"output_tokens": 24596,
	"total_tokens": 1273139,
	"billing_tokens": 1273139,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1192448,
	"total_cache_tokens": 1192448,
	"effective_input_tokens": 56095,
	"display_input_tokens": 1248543,
	"usage_event_count": 33,
	"tool_calls": 36,
	"turn_count": 33,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 33,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| shell referenced checker: grep -n \"CANDLE_CLASSES\\\|BIRCH_CLASSES\\\|LAYOUT_CLASSES\\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| shell referenced checker: grep -n \"eyebrow\\\|lede\\\|muted\\\|caption\\\|subtle\\\|note\\\|entity\\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 17730,
	"generation_ok": false,
	"generation_duration_s": 142.653,
	"input_tokens": 54919,
	"output_tokens": 5427,
	"total_tokens": 60346,
	"billing_tokens": 60346,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 0,
	"effective_input_tokens": 54919,
	"display_input_tokens": 54919,
	"usage_event_count": 5,
	"tool_calls": 10,
	"turn_count": 5,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 5,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 6,
	"deterministic_warnings": 0,
	"vlm_failures": 7,
	"vlm_warnings": 1,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 3,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 50937,
	"generation_ok": true,
	"generation_duration_s": 372.779,
	"input_tokens": 468652,
	"output_tokens": 19358,
	"total_tokens": 488010,
	"billing_tokens": 488010,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 415232,
	"total_cache_tokens": 415232,
	"effective_input_tokens": 53420,
	"display_input_tokens": 468652,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 51725,
	"generation_ok": true,
	"generation_duration_s": 427.336,
	"input_tokens": 358341,
	"output_tokens": 15297,
	"total_tokens": 373638,
	"billing_tokens": 373638,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 299776,
	"total_cache_tokens": 299776,
	"effective_input_tokens": 58565,
	"display_input_tokens": 358341,
	"usage_event_count": 14,
	"tool_calls": 14,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 1,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "kimi27",
	"model_slug": "kimi27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi27-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/kimi27/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 41967,
	"generation_ok": true,
	"generation_duration_s": 210.371,
	"input_tokens": 1978925,
	"output_tokens": 17532,
	"total_tokens": 1996457,
	"billing_tokens": 1996457,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1687898,
	"total_cache_tokens": 1687898,
	"effective_input_tokens": 291027,
	"display_input_tokens": 1978925,
	"usage_event_count": 30,
	"tool_calls": 32,
	"turn_count": 30,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 30,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && if [ -f skill/scripts/check_birch_renderings.py ]; then uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs \| read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/numeric-data.html && uv run --with pillow python s",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi27",
	"model_slug": "kimi27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi27-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/kimi27/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 47402,
	"generation_ok": true,
	"generation_duration_s": 253.252,
	"input_tokens": 1509119,
	"output_tokens": 28034,
	"total_tokens": 1537153,
	"billing_tokens": 1537153,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1144217,
	"total_cache_tokens": 1144217,
	"effective_input_tokens": 364902,
	"display_input_tokens": 1509119,
	"usage_event_count": 25,
	"tool_calls": 30,
	"turn_count": 25,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 25,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/code-review.html 2> \| ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/code-review.html && uv run --with pillow py",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi27",
	"model_slug": "kimi27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi27-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/kimi27/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 52748,
	"generation_ok": true,
	"generation_duration_s": 136.617,
	"input_tokens": 582570,
	"output_tokens": 12473,
	"total_tokens": 595043,
	"billing_tokens": 595043,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 334281,
	"total_cache_tokens": 334281,
	"effective_input_tokens": 248289,
	"display_input_tokens": 582570,
	"usage_event_count": 7,
	"tool_calls": 14,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/module-explainer.ht",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi27",
	"model_slug": "kimi27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi27-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/kimi27/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 52277,
	"generation_ok": true,
	"generation_duration_s": 72.968,
	"input_tokens": 487122,
	"output_tokens": 6684,
	"total_tokens": 493806,
	"billing_tokens": 493806,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 332350,
	"total_cache_tokens": 332350,
	"effective_input_tokens": 154772,
	"display_input_tokens": 487122,
	"usage_event_count": 9,
	"tool_calls": 9,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/implementatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi27",
	"model_slug": "kimi27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi27-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/kimi27/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 59856,
	"generation_ok": true,
	"generation_duration_s": 159.927,
	"input_tokens": 1290293,
	"output_tokens": 18058,
	"total_tokens": 1308351,
	"billing_tokens": 1308351,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1169139,
	"total_cache_tokens": 1169139,
	"effective_input_tokens": 121154,
	"display_input_tokens": 1290293,
	"usage_event_count": 16,
	"tool_calls": 19,
	"turn_count": 16,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 16,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/benchmark-co \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 50838,
	"generation_ok": false,
	"generation_duration_s": 160.154,
	"input_tokens": 87235,
	"output_tokens": 10902,
	"total_tokens": 98137,
	"billing_tokens": 98137,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 116736,
	"total_cache_tokens": 116736,
	"effective_input_tokens": 81499,
	"display_input_tokens": 198235,
	"usage_event_count": 12,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 12,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/minimax27/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 43165,
	"generation_ok": true,
	"generation_duration_s": 211.215,
	"input_tokens": 444148,
	"output_tokens": 7213,
	"total_tokens": 451361,
	"billing_tokens": 451361,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 355328,
	"total_cache_tokens": 355328,
	"effective_input_tokens": 88820,
	"display_input_tokens": 444148,
	"usage_event_count": 18,
	"tool_calls": 20,
	"turn_count": 18,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 50511,
	"generation_ok": false,
	"generation_duration_s": 183.748,
	"input_tokens": 185140,
	"output_tokens": 15068,
	"total_tokens": 200208,
	"billing_tokens": 200208,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 232320,
	"total_cache_tokens": 232320,
	"effective_input_tokens": 148313,
	"display_input_tokens": 380633,
	"usage_event_count": 9,
	"tool_calls": 9,
	"turn_count": 5,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 21904,
	"generation_ok": false,
	"generation_duration_s": 64.763,
	"input_tokens": 27146,
	"output_tokens": 4563,
	"total_tokens": 31709,
	"billing_tokens": 31709,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 7040,
	"total_cache_tokens": 7040,
	"effective_input_tokens": 11494,
	"display_input_tokens": 18534,
	"usage_event_count": 3,
	"tool_calls": 3,
	"turn_count": 4,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 3,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 14,
	"deterministic_warnings": 4,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 4,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 3,
	"desktop_warnings": 1,
	"mobile_failures": 4,
	"mobile_warnings": 1,
	"deep_failures": 3,
	"deep_warnings": 1,
	"mobile_deep_failures": 4,
	"mobile_deep_warnings": 1,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 79228,
	"generation_ok": false,
	"generation_duration_s": 420.033,
	"input_tokens": 511926,
	"output_tokens": 33192,
	"total_tokens": 545118,
	"billing_tokens": 545118,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 129664,
	"total_cache_tokens": 129664,
	"effective_input_tokens": 154885,
	"display_input_tokens": 284549,
	"usage_event_count": 7,
	"tool_calls": 14,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 4,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 45758,
	"generation_ok": true,
	"generation_duration_s": 106.088,
	"input_tokens": 161380,
	"output_tokens": 8823,
	"total_tokens": 170203,
	"billing_tokens": 170203,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 114642,
	"cache_write_tokens": 25769,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 140411,
	"effective_input_tokens": 20969,
	"display_input_tokens": 161380,
	"usage_event_count": 10,
	"tool_calls": 12,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/opus47/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 50191,
	"generation_ok": true,
	"generation_duration_s": 268.356,
	"input_tokens": 571314,
	"output_tokens": 17059,
	"total_tokens": 588373,
	"billing_tokens": 588373,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 441950,
	"cache_write_tokens": 55976,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 497926,
	"effective_input_tokens": 73388,
	"display_input_tokens": 571314,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 58814,
	"generation_ok": true,
	"generation_duration_s": 206.748,
	"input_tokens": 653611,
	"output_tokens": 15632,
	"total_tokens": 669243,
	"billing_tokens": 669243,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 502232,
	"cache_write_tokens": 65941,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 568173,
	"effective_input_tokens": 85438,
	"display_input_tokens": 653611,
	"usage_event_count": 13,
	"tool_calls": 19,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 13,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 53012,
	"generation_ok": true,
	"generation_duration_s": 141.632,
	"input_tokens": 206186,
	"output_tokens": 9414,
	"total_tokens": 215600,
	"billing_tokens": 215600,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 160139,
	"cache_write_tokens": 23940,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 184079,
	"effective_input_tokens": 22107,
	"display_input_tokens": 206186,
	"usage_event_count": 11,
	"tool_calls": 12,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 64934,
	"generation_ok": true,
	"generation_duration_s": 150.046,
	"input_tokens": 388331,
	"output_tokens": 9617,
	"total_tokens": 397948,
	"billing_tokens": 397948,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 328368,
	"cache_write_tokens": 33477,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 361845,
	"effective_input_tokens": 26486,
	"display_input_tokens": 388331,
	"usage_event_count": 19,
	"tool_calls": 22,
	"turn_count": 19,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 19,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 52394,
	"generation_ok": true,
	"generation_duration_s": 203.959,
	"input_tokens": 302149,
	"output_tokens": 14758,
	"total_tokens": 316907,
	"billing_tokens": 316907,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 234504,
	"cache_write_tokens": 38197,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 272701,
	"effective_input_tokens": 29448,
	"display_input_tokens": 302149,
	"usage_event_count": 13,
	"tool_calls": 15,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 13,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 57805,
	"generation_ok": true,
	"generation_duration_s": 302.047,
	"input_tokens": 477280,
	"output_tokens": 18427,
	"total_tokens": 495707,
	"billing_tokens": 495707,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 368349,
	"cache_write_tokens": 44875,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 413224,
	"effective_input_tokens": 64056,
	"display_input_tokens": 477280,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 66525,
	"generation_ok": true,
	"generation_duration_s": 978.64,
	"input_tokens": 2649057,
	"output_tokens": 62243,
	"total_tokens": 2711300,
	"billing_tokens": 2711300,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 2413844,
	"cache_write_tokens": 135163,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 2549007,
	"effective_input_tokens": 100050,
	"display_input_tokens": 2649057,
	"usage_event_count": 34,
	"tool_calls": 38,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 34,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 49926,
	"generation_ok": true,
	"generation_duration_s": 196.05,
	"input_tokens": 257093,
	"output_tokens": 12916,
	"total_tokens": 270009,
	"billing_tokens": 270009,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 210864,
	"cache_write_tokens": 24527,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 235391,
	"effective_input_tokens": 21702,
	"display_input_tokens": 257093,
	"usage_event_count": 14,
	"tool_calls": 15,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 122208,
	"generation_ok": true,
	"generation_duration_s": 623.147,
	"input_tokens": 1192904,
	"output_tokens": 48270,
	"total_tokens": 1241174,
	"billing_tokens": 1241174,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 987803,
	"cache_write_tokens": 129337,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 1117140,
	"effective_input_tokens": 75764,
	"display_input_tokens": 1192904,
	"usage_event_count": 18,
	"tool_calls": 22,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	}
	]

Xet Storage Details

Size:: 204 kB
Xet hash:: 6727550d9199a333f7d2dbe313d5e3b4347b5e5c4a54cdd75afb3aac2e958117

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.