evalstate/birch-html / analysis /data /artifact-summary.json
evalstate's picture
download
raw
204 kB
[
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 41655,
"generation_ok": true,
"generation_duration_s": 233.57,
"input_tokens": 257043,
"output_tokens": 19565,
"total_tokens": 276608,
"billing_tokens": 276608,
"reasoning_tokens": 13843,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 236032,
"total_cache_tokens": 236032,
"effective_input_tokens": 21011,
"display_input_tokens": 257043,
"usage_event_count": 12,
"tool_calls": 16,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica",
"deterministic_failures": 0,
"deterministic_warnings": 2,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 40247,
"generation_ok": true,
"generation_duration_s": 251.091,
"input_tokens": 1602209,
"output_tokens": 16541,
"total_tokens": 1618750,
"billing_tokens": 1618750,
"reasoning_tokens": 10735,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1516544,
"total_cache_tokens": 1516544,
"effective_input_tokens": 85665,
"display_input_tokens": 1602209,
"usage_event_count": 24,
"tool_calls": 39,
"turn_count": 24,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 24,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help | sed -n '1,220p' | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 51503,
"generation_ok": true,
"generation_duration_s": 228.357,
"input_tokens": 538144,
"output_tokens": 20613,
"total_tokens": 558757,
"billing_tokens": 558757,
"reasoning_tokens": 12973,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 489472,
"total_cache_tokens": 489472,
"effective_input_tokens": 48672,
"display_input_tokens": 538144,
"usage_event_count": 14,
"tool_calls": 29,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg -n \"^def (contract_findings|compare_stats|screenshot_findings|artifact_screenshot_findings|geometry_findings|render_markdown|capture|find_chrome|capture_height_for_viewport|css_ | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 48838,
"generation_ok": true,
"generation_duration_s": 249.193,
"input_tokens": 122451,
"output_tokens": 13529,
"total_tokens": 135980,
"billing_tokens": 135980,
"reasoning_tokens": 8129,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 103936,
"total_cache_tokens": 103936,
"effective_input_tokens": 18515,
"display_input_tokens": 122451,
"usage_event_count": 8,
"tool_calls": 11,
"turn_count": 8,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 8,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 55271,
"generation_ok": true,
"generation_duration_s": 193.592,
"input_tokens": 280048,
"output_tokens": 17564,
"total_tokens": 297612,
"billing_tokens": 297612,
"reasoning_tokens": 9912,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 261120,
"total_cache_tokens": 261120,
"effective_input_tokens": 18928,
"display_input_tokens": 280048,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 4,
"self_check_failed_runs": 3,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 41967,
"generation_ok": true,
"generation_duration_s": 118.283,
"input_tokens": 95354,
"output_tokens": 5337,
"total_tokens": 100691,
"billing_tokens": 100691,
"reasoning_tokens": 402,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 78848,
"total_cache_tokens": 78848,
"effective_input_tokens": 16506,
"display_input_tokens": 95354,
"usage_event_count": 10,
"tool_calls": 10,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 1,
"self_corrected_after_checker": true,
"self_correction_verified": true,
"assistant_turns_trace": 10,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 44204,
"generation_ok": true,
"generation_duration_s": 164.43,
"input_tokens": 388756,
"output_tokens": 7268,
"total_tokens": 396024,
"billing_tokens": 396024,
"reasoning_tokens": 2335,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 346624,
"total_cache_tokens": 346624,
"effective_input_tokens": 42132,
"display_input_tokens": 388756,
"usage_event_count": 16,
"tool_calls": 22,
"turn_count": 16,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 16,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 57189,
"generation_ok": true,
"generation_duration_s": 178.972,
"input_tokens": 450726,
"output_tokens": 9063,
"total_tokens": 459789,
"billing_tokens": 459789,
"reasoning_tokens": 477,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 400896,
"total_cache_tokens": 400896,
"effective_input_tokens": 49830,
"display_input_tokens": 450726,
"usage_event_count": 14,
"tool_calls": 25,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <met | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/mod | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html')\ns=p.read",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 91,
"task_score": 18.2,
"task_score_max": 20,
"quality_score": 91,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 49708,
"generation_ok": true,
"generation_duration_s": 144.313,
"input_tokens": 129170,
"output_tokens": 6893,
"total_tokens": 136063,
"billing_tokens": 136063,
"reasoning_tokens": 369,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 91136,
"total_cache_tokens": 91136,
"effective_input_tokens": 38034,
"display_input_tokens": 129170,
"usage_event_count": 11,
"tool_calls": 13,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexre | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/implementation-plan.html')",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 53609,
"generation_ok": true,
"generation_duration_s": 142.604,
"input_tokens": 126650,
"output_tokens": 6524,
"total_tokens": 133174,
"billing_tokens": 133174,
"reasoning_tokens": 491,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 101376,
"total_cache_tokens": 101376,
"effective_input_tokens": 25274,
"display_input_tokens": 126650,
"usage_event_count": 11,
"tool_calls": 13,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/benchmark-comparison.html' | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 17281,
"generation_ok": true,
"generation_duration_s": 82.34,
"input_tokens": 825347,
"output_tokens": 23923,
"total_tokens": 849270,
"billing_tokens": 849270,
"reasoning_tokens": 13374,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 770688,
"total_cache_tokens": 770688,
"effective_input_tokens": 54659,
"display_input_tokens": 825347,
"usage_event_count": 32,
"tool_calls": 31,
"turn_count": 32,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 32,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 6,
"deterministic_warnings": 2,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 1,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 1,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/codexspark/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 9658,
"generation_ok": false,
"generation_duration_s": 60.395,
"input_tokens": 1737615,
"output_tokens": 21291,
"total_tokens": 1758906,
"billing_tokens": 1758906,
"reasoning_tokens": 17081,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1702656,
"total_cache_tokens": 1702656,
"effective_input_tokens": 86941,
"display_input_tokens": 1789597,
"usage_event_count": 41,
"tool_calls": 32,
"turn_count": 26,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 41,
"self_check_mode": "checker-shell-reference,read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '1,260p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '260,560p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '560,920p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '920,1320p'",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 16366,
"generation_ok": false,
"generation_duration_s": 87.747,
"input_tokens": 2740590,
"output_tokens": 27049,
"total_tokens": 2767639,
"billing_tokens": 2767639,
"reasoning_tokens": 15704,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2024320,
"total_cache_tokens": 2024320,
"effective_input_tokens": 202803,
"display_input_tokens": 2227123,
"usage_event_count": 35,
"tool_calls": 51,
"turn_count": 42,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 35,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node|flow-edge|flow-list|flow-step|metric-row|chart-panel|finding|code-block|copyable|timeline)\" styles/birch-system.css | shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain | ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char",
"deterministic_failures": 0,
"deterministic_warnings": 4,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 0,
"mobile_warnings": 1,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 1,
"artifact_present": true,
"artifact_score_100": 91,
"task_score": 18.2,
"task_score_max": 20,
"quality_score": 91,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 46864,
"generation_ok": true,
"generation_duration_s": 91.953,
"input_tokens": 1108319,
"output_tokens": 14746,
"total_tokens": 1123065,
"billing_tokens": 1123065,
"reasoning_tokens": 8043,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1055232,
"total_cache_tokens": 1055232,
"effective_input_tokens": 53087,
"display_input_tokens": 1108319,
"usage_event_count": 35,
"tool_calls": 37,
"turn_count": 35,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 35,
"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | head -n 120 | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 55786,
"generation_ok": true,
"generation_duration_s": 41.038,
"input_tokens": 681289,
"output_tokens": 5651,
"total_tokens": 686940,
"billing_tokens": 686940,
"reasoning_tokens": 4100,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 628224,
"total_cache_tokens": 628224,
"effective_input_tokens": 53065,
"display_input_tokens": 681289,
"usage_event_count": 24,
"tool_calls": 23,
"turn_count": 24,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 24,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 62489,
"generation_ok": true,
"generation_duration_s": 280.24,
"input_tokens": 594128,
"output_tokens": 18097,
"total_tokens": 612225,
"billing_tokens": 612225,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 560512,
"total_cache_tokens": 560512,
"effective_input_tokens": 33616,
"display_input_tokens": 594128,
"usage_event_count": 18,
"tool_calls": 20,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 18,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/deepseek/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 62789,
"generation_ok": true,
"generation_duration_s": 294.1,
"input_tokens": 784186,
"output_tokens": 14634,
"total_tokens": 798820,
"billing_tokens": 798820,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 749440,
"total_cache_tokens": 749440,
"effective_input_tokens": 34746,
"display_input_tokens": 784186,
"usage_event_count": 26,
"tool_calls": 30,
"turn_count": 26,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 26,
"self_check_mode": "checker-shell-reference,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review | shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py | grep -A5 \"add_argument\" | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\|--artifact\" skill/scripts/check_birch_renderings.py | head -10 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py | head -3 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 31473,
"generation_ok": false,
"generation_duration_s": 177.334,
"input_tokens": 215656,
"output_tokens": 9938,
"total_tokens": 225594,
"billing_tokens": 225594,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 449920,
"total_cache_tokens": 449920,
"effective_input_tokens": 48511,
"display_input_tokens": 498431,
"usage_event_count": 10,
"tool_calls": 10,
"turn_count": 6,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 10,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 8,
"deterministic_warnings": 1,
"vlm_failures": 7,
"vlm_warnings": 0,
"deterministic_failure_units": 3,
"deterministic_warning_units": 1,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 1,
"mobile_failures": 3,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 3,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 52099,
"generation_ok": true,
"generation_duration_s": 112.544,
"input_tokens": 173739,
"output_tokens": 6911,
"total_tokens": 180650,
"billing_tokens": 180650,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 160128,
"total_cache_tokens": 160128,
"effective_input_tokens": 13611,
"display_input_tokens": 173739,
"usage_event_count": 12,
"tool_calls": 15,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 78962,
"generation_ok": true,
"generation_duration_s": 378.136,
"input_tokens": 767427,
"output_tokens": 27984,
"total_tokens": 795411,
"billing_tokens": 795411,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 717696,
"total_cache_tokens": 717696,
"effective_input_tokens": 49731,
"display_input_tokens": 767427,
"usage_event_count": 18,
"tool_calls": 22,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "checker-shell-reference",
"self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 53215,
"generation_ok": true,
"generation_duration_s": 114.216,
"input_tokens": 1371616,
"output_tokens": 5260,
"total_tokens": 1376876,
"billing_tokens": 1376876,
"reasoning_tokens": 12418,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1116684,
"total_cache_tokens": 1116684,
"effective_input_tokens": 254932,
"display_input_tokens": 1371616,
"usage_event_count": 29,
"tool_calls": 28,
"turn_count": 29,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 29,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html | ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 53047,
"generation_ok": true,
"generation_duration_s": 193.238,
"input_tokens": 1684136,
"output_tokens": 6902,
"total_tokens": 1691038,
"billing_tokens": 1691038,
"reasoning_tokens": 23273,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1424691,
"total_cache_tokens": 1424691,
"effective_input_tokens": 259445,
"display_input_tokens": 1684136,
"usage_event_count": 34,
"tool_calls": 33,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 34,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 57420,
"generation_ok": true,
"generation_duration_s": 203.178,
"input_tokens": 2196880,
"output_tokens": 10222,
"total_tokens": 2207102,
"billing_tokens": 2207102,
"reasoning_tokens": 22501,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1965131,
"total_cache_tokens": 1965131,
"effective_input_tokens": 231749,
"display_input_tokens": 2196880,
"usage_event_count": 33,
"tool_calls": 32,
"turn_count": 33,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 2,
"self_check_failed_runs": 2,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 33,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read scripts/check_birch_renderings.py | ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 49628,
"generation_ok": true,
"generation_duration_s": 201.715,
"input_tokens": 2346900,
"output_tokens": 9173,
"total_tokens": 2356073,
"billing_tokens": 2356073,
"reasoning_tokens": 15150,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2043078,
"total_cache_tokens": 2043078,
"effective_input_tokens": 303822,
"display_input_tokens": 2346900,
"usage_event_count": 34,
"tool_calls": 33,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 5,
"self_check_failed_runs": 4,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 34,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 97390,
"generation_ok": true,
"generation_duration_s": 62.077,
"input_tokens": 495825,
"output_tokens": 829,
"total_tokens": 496654,
"billing_tokens": 496654,
"reasoning_tokens": 4961,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 387138,
"total_cache_tokens": 387138,
"effective_input_tokens": 108687,
"display_input_tokens": 495825,
"usage_event_count": 17,
"tool_calls": 16,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 17,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 62971,
"generation_ok": true,
"generation_duration_s": 300.114,
"input_tokens": 459899,
"output_tokens": 16275,
"total_tokens": 476174,
"billing_tokens": 476174,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 369152,
"total_cache_tokens": 369152,
"effective_input_tokens": 90747,
"display_input_tokens": 459899,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/glm51/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 48933,
"generation_ok": true,
"generation_duration_s": 133.324,
"input_tokens": 254816,
"output_tokens": 8008,
"total_tokens": 262824,
"billing_tokens": 262824,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 202560,
"total_cache_tokens": 202560,
"effective_input_tokens": 52256,
"display_input_tokens": 254816,
"usage_event_count": 11,
"tool_calls": 13,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 2,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 92,
"task_score": 18.4,
"task_score_max": 20,
"quality_score": 92,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 54229,
"generation_ok": true,
"generation_duration_s": 94.822,
"input_tokens": 358438,
"output_tokens": 6652,
"total_tokens": 365090,
"billing_tokens": 365090,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 254656,
"total_cache_tokens": 254656,
"effective_input_tokens": 103782,
"display_input_tokens": 358438,
"usage_event_count": 9,
"tool_calls": 15,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 60535,
"generation_ok": true,
"generation_duration_s": 90.03,
"input_tokens": 210191,
"output_tokens": 7574,
"total_tokens": 217765,
"billing_tokens": 217765,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 180736,
"total_cache_tokens": 180736,
"effective_input_tokens": 29455,
"display_input_tokens": 210191,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation",
"deterministic_failures": 2,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 64863,
"generation_ok": true,
"generation_duration_s": 149.159,
"input_tokens": 274201,
"output_tokens": 14416,
"total_tokens": 288617,
"billing_tokens": 288617,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 214336,
"total_cache_tokens": 214336,
"effective_input_tokens": 59865,
"display_input_tokens": 274201,
"usage_event_count": 12,
"tool_calls": 14,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm52",
"model_slug": "glm52",
"source_kind": "clean-final",
"label": "skill-with-shell-glm52-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/glm52/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 51395,
"generation_ok": true,
"generation_duration_s": 274.73,
"input_tokens": 271862,
"output_tokens": 16133,
"total_tokens": 287995,
"billing_tokens": 287995,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 239936,
"total_cache_tokens": 239936,
"effective_input_tokens": 31926,
"display_input_tokens": 271862,
"usage_event_count": 14,
"tool_calls": 17,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html && echo \"=== finished ===\" && uv | ran checker CLI: cd /home/shaun/source/birch-html && F=eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html\necho \"=== my local style block (after system block) ===\"\npython3 - \"$F\" <",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm52",
"model_slug": "glm52",
"source_kind": "clean-final",
"label": "skill-with-shell-glm52-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/glm52/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 50845,
"generation_ok": true,
"generation_duration_s": 771.097,
"input_tokens": 1249523,
"output_tokens": 43260,
"total_tokens": 1292783,
"billing_tokens": 1292783,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1177792,
"total_cache_tokens": 1177792,
"effective_input_tokens": 71731,
"display_input_tokens": 1249523,
"usage_event_count": 25,
"tool_calls": 32,
"turn_count": 25,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 25,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/code-review.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/code-r | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --help 2>&1 | rg -i 'viewport|artifact|mobile|width' | head; echo \"=== run m | checker CLI usage error",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm52",
"model_slug": "glm52",
"source_kind": "clean-final",
"label": "skill-with-shell-glm52-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/glm52/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 59952,
"generation_ok": true,
"generation_duration_s": 751.331,
"input_tokens": 1204327,
"output_tokens": 40435,
"total_tokens": 1244762,
"billing_tokens": 1244762,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1118592,
"total_cache_tokens": 1118592,
"effective_input_tokens": 85735,
"display_input_tokens": 1204327,
"usage_event_count": 22,
"tool_calls": 32,
"turn_count": 22,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 22,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/module-explainer.htm",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm52",
"model_slug": "glm52",
"source_kind": "clean-final",
"label": "skill-with-shell-glm52-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/glm52/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 56320,
"generation_ok": true,
"generation_duration_s": 456.209,
"input_tokens": 991570,
"output_tokens": 24123,
"total_tokens": 1015693,
"billing_tokens": 1015693,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 911168,
"total_cache_tokens": 911168,
"effective_input_tokens": 80402,
"display_input_tokens": 991570,
"usage_event_count": 18,
"tool_calls": 26,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/implementation | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-glm52-pu",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm52",
"model_slug": "glm52",
"source_kind": "clean-final",
"label": "skill-with-shell-glm52-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/glm52/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 60487,
"generation_ok": true,
"generation_duration_s": 380.184,
"input_tokens": 522022,
"output_tokens": 23534,
"total_tokens": 545556,
"billing_tokens": 545556,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 459648,
"total_cache_tokens": 459648,
"effective_input_tokens": 62374,
"display_input_tokens": 522022,
"usage_event_count": 16,
"tool_calls": 19,
"turn_count": 16,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 16,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/benchmark-comp | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/benchm",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 40305,
"generation_ok": true,
"generation_duration_s": 63.372,
"input_tokens": 91503,
"output_tokens": 5097,
"total_tokens": 96600,
"billing_tokens": 96600,
"reasoning_tokens": 1083,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 76800,
"total_cache_tokens": 76800,
"effective_input_tokens": 14703,
"display_input_tokens": 91503,
"usage_event_count": 8,
"tool_calls": 11,
"turn_count": 8,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 2,
"deterministic_warnings": 2,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 39494,
"generation_ok": true,
"generation_duration_s": 94.334,
"input_tokens": 461816,
"output_tokens": 6027,
"total_tokens": 467843,
"billing_tokens": 467843,
"reasoning_tokens": 2855,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 384640,
"total_cache_tokens": 384640,
"effective_input_tokens": 77176,
"display_input_tokens": 461816,
"usage_event_count": 17,
"tool_calls": 18,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 17,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 46290,
"generation_ok": true,
"generation_duration_s": 93.641,
"input_tokens": 555669,
"output_tokens": 7177,
"total_tokens": 562846,
"billing_tokens": 562846,
"reasoning_tokens": 1701,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 450304,
"total_cache_tokens": 450304,
"effective_input_tokens": 105365,
"display_input_tokens": 555669,
"usage_event_count": 17,
"tool_calls": 23,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 17,
"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu | ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h | checker CLI usage error",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 91,
"task_score": 18.2,
"task_score_max": 20,
"quality_score": 91,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 45485,
"generation_ok": true,
"generation_duration_s": 59.362,
"input_tokens": 90659,
"output_tokens": 4766,
"total_tokens": 95425,
"billing_tokens": 95425,
"reasoning_tokens": 589,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 71168,
"total_cache_tokens": 71168,
"effective_input_tokens": 19491,
"display_input_tokens": 90659,
"usage_event_count": 9,
"tool_calls": 10,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 46793,
"generation_ok": true,
"generation_duration_s": 61.812,
"input_tokens": 60483,
"output_tokens": 5615,
"total_tokens": 66098,
"billing_tokens": 66098,
"reasoning_tokens": 746,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 53376,
"total_cache_tokens": 53376,
"effective_input_tokens": 7107,
"display_input_tokens": 60483,
"usage_event_count": 7,
"tool_calls": 8,
"turn_count": 7,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 88,
"task_score": 17.6,
"task_score_max": 20,
"quality_score": 88,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 36903,
"generation_ok": true,
"generation_duration_s": 49.028,
"input_tokens": 73338,
"output_tokens": 3307,
"total_tokens": 76645,
"billing_tokens": 76645,
"reasoning_tokens": 925,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 62720,
"total_cache_tokens": 62720,
"effective_input_tokens": 10618,
"display_input_tokens": 73338,
"usage_event_count": 10,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 38297,
"generation_ok": true,
"generation_duration_s": 55.392,
"input_tokens": 190492,
"output_tokens": 4553,
"total_tokens": 195045,
"billing_tokens": 195045,
"reasoning_tokens": 2340,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 147520,
"total_cache_tokens": 147520,
"effective_input_tokens": 42972,
"display_input_tokens": 190492,
"usage_event_count": 11,
"tool_calls": 10,
"turn_count": 11,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 9279,
"generation_ok": false,
"generation_duration_s": 40.052,
"input_tokens": 125766,
"output_tokens": 3826,
"total_tokens": 129592,
"billing_tokens": 129592,
"reasoning_tokens": 1202,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 46784,
"total_cache_tokens": 46784,
"effective_input_tokens": 53433,
"display_input_tokens": 100217,
"usage_event_count": 15,
"tool_calls": 6,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 3,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 16152,
"generation_ok": false,
"generation_duration_s": 41.596,
"input_tokens": 32235,
"output_tokens": 5236,
"total_tokens": 37471,
"billing_tokens": 37471,
"reasoning_tokens": 1207,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 39488,
"total_cache_tokens": 39488,
"effective_input_tokens": 20479,
"display_input_tokens": 59967,
"usage_event_count": 8,
"tool_calls": 4,
"turn_count": 5,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 10364,
"generation_ok": false,
"generation_duration_s": 98.19,
"input_tokens": 153411,
"output_tokens": 7388,
"total_tokens": 160799,
"billing_tokens": 160799,
"reasoning_tokens": 2517,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 39488,
"total_cache_tokens": 39488,
"effective_input_tokens": 6645,
"display_input_tokens": 46133,
"usage_event_count": 8,
"tool_calls": 15,
"turn_count": 16,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 1,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 23937,
"generation_ok": false,
"generation_duration_s": 67.62,
"input_tokens": 119520,
"output_tokens": 7707,
"total_tokens": 127227,
"billing_tokens": 127227,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 7297,
"cache_write_tokens": 12081,
"cache_hit_tokens": 0,
"total_cache_tokens": 19378,
"effective_input_tokens": 11280,
"display_input_tokens": 30658,
"usage_event_count": 4,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 4,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data",
"deterministic_failures": 16,
"deterministic_warnings": 12,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 4,
"deterministic_warning_units": 3,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 4,
"desktop_warnings": 3,
"mobile_failures": 4,
"mobile_warnings": 3,
"deep_failures": 4,
"deep_warnings": 3,
"mobile_deep_failures": 4,
"mobile_deep_warnings": 3,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/haiku45/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 53526,
"generation_ok": true,
"generation_duration_s": 94.461,
"input_tokens": 301467,
"output_tokens": 10117,
"total_tokens": 311584,
"billing_tokens": 311584,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 228528,
"cache_write_tokens": 34499,
"cache_hit_tokens": 0,
"total_cache_tokens": 263027,
"effective_input_tokens": 38440,
"display_input_tokens": 301467,
"usage_event_count": 11,
"tool_calls": 11,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
"deterministic_failures": 6,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 87,
"task_score": 17.4,
"task_score_max": 20,
"quality_score": 87,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 57853,
"generation_ok": false,
"generation_duration_s": 75.42,
"input_tokens": 211164,
"output_tokens": 9407,
"total_tokens": 220571,
"billing_tokens": 220571,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 55031,
"cache_hit_tokens": 0,
"total_cache_tokens": 55031,
"effective_input_tokens": 80985,
"display_input_tokens": 136016,
"usage_event_count": 3,
"tool_calls": 10,
"turn_count": 6,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 3,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 50641,
"generation_ok": true,
"generation_duration_s": 67.418,
"input_tokens": 123711,
"output_tokens": 7166,
"total_tokens": 130877,
"billing_tokens": 130877,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 91600,
"cache_write_tokens": 16126,
"cache_hit_tokens": 0,
"total_cache_tokens": 107726,
"effective_input_tokens": 15985,
"display_input_tokens": 123711,
"usage_event_count": 9,
"tool_calls": 9,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 49137,
"generation_ok": true,
"generation_duration_s": 65.28,
"input_tokens": 151349,
"output_tokens": 7796,
"total_tokens": 159145,
"billing_tokens": 159145,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 122743,
"cache_write_tokens": 12640,
"cache_hit_tokens": 0,
"total_cache_tokens": 135383,
"effective_input_tokens": 15966,
"display_input_tokens": 151349,
"usage_event_count": 11,
"tool_calls": 10,
"turn_count": 11,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 3,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 67620,
"generation_ok": true,
"generation_duration_s": 194.344,
"input_tokens": 470039,
"output_tokens": 5317,
"total_tokens": 475356,
"billing_tokens": 475356,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 425472,
"total_cache_tokens": 425472,
"effective_input_tokens": 44567,
"display_input_tokens": 470039,
"usage_event_count": 20,
"tool_calls": 23,
"turn_count": 20,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 20,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/kimi/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 44300,
"generation_ok": true,
"generation_duration_s": 627.536,
"input_tokens": 1248543,
"output_tokens": 24596,
"total_tokens": 1273139,
"billing_tokens": 1273139,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1192448,
"total_cache_tokens": 1192448,
"effective_input_tokens": 56095,
"display_input_tokens": 1248543,
"usage_event_count": 33,
"tool_calls": 36,
"turn_count": 33,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 33,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"CANDLE_CLASSES\\|BIRCH_CLASSES\\|LAYOUT_CLASSES\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"eyebrow\\|lede\\|muted\\|caption\\|subtle\\|note\\|entity\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 17730,
"generation_ok": false,
"generation_duration_s": 142.653,
"input_tokens": 54919,
"output_tokens": 5427,
"total_tokens": 60346,
"billing_tokens": 60346,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 0,
"total_cache_tokens": 0,
"effective_input_tokens": 54919,
"display_input_tokens": 54919,
"usage_event_count": 5,
"tool_calls": 10,
"turn_count": 5,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 5,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 6,
"deterministic_warnings": 0,
"vlm_failures": 7,
"vlm_warnings": 1,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 3,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 50937,
"generation_ok": true,
"generation_duration_s": 372.779,
"input_tokens": 468652,
"output_tokens": 19358,
"total_tokens": 488010,
"billing_tokens": 488010,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 415232,
"total_cache_tokens": 415232,
"effective_input_tokens": 53420,
"display_input_tokens": 468652,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 51725,
"generation_ok": true,
"generation_duration_s": 427.336,
"input_tokens": 358341,
"output_tokens": 15297,
"total_tokens": 373638,
"billing_tokens": 373638,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 299776,
"total_cache_tokens": 299776,
"effective_input_tokens": 58565,
"display_input_tokens": 358341,
"usage_event_count": 14,
"tool_calls": 14,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 1,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "kimi27",
"model_slug": "kimi27",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi27-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/kimi27/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 41967,
"generation_ok": true,
"generation_duration_s": 210.371,
"input_tokens": 1978925,
"output_tokens": 17532,
"total_tokens": 1996457,
"billing_tokens": 1996457,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1687898,
"total_cache_tokens": 1687898,
"effective_input_tokens": 291027,
"display_input_tokens": 1978925,
"usage_event_count": 30,
"tool_calls": 32,
"turn_count": 30,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 30,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && if [ -f skill/scripts/check_birch_renderings.py ]; then uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs | read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/numeric-data.html && uv run --with pillow python s",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi27",
"model_slug": "kimi27",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi27-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/kimi27/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 47402,
"generation_ok": true,
"generation_duration_s": 253.252,
"input_tokens": 1509119,
"output_tokens": 28034,
"total_tokens": 1537153,
"billing_tokens": 1537153,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1144217,
"total_cache_tokens": 1144217,
"effective_input_tokens": 364902,
"display_input_tokens": 1509119,
"usage_event_count": 25,
"tool_calls": 30,
"turn_count": 25,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 25,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/code-review.html 2> | ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/code-review.html && uv run --with pillow py",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi27",
"model_slug": "kimi27",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi27-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/kimi27/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 52748,
"generation_ok": true,
"generation_duration_s": 136.617,
"input_tokens": 582570,
"output_tokens": 12473,
"total_tokens": 595043,
"billing_tokens": 595043,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 334281,
"total_cache_tokens": 334281,
"effective_input_tokens": 248289,
"display_input_tokens": 582570,
"usage_event_count": 7,
"tool_calls": 14,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/module-explainer.ht",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi27",
"model_slug": "kimi27",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi27-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/kimi27/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 52277,
"generation_ok": true,
"generation_duration_s": 72.968,
"input_tokens": 487122,
"output_tokens": 6684,
"total_tokens": 493806,
"billing_tokens": 493806,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 332350,
"total_cache_tokens": 332350,
"effective_input_tokens": 154772,
"display_input_tokens": 487122,
"usage_event_count": 9,
"tool_calls": 9,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi27",
"model_slug": "kimi27",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi27-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/kimi27/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 59856,
"generation_ok": true,
"generation_duration_s": 159.927,
"input_tokens": 1290293,
"output_tokens": 18058,
"total_tokens": 1308351,
"billing_tokens": 1308351,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1169139,
"total_cache_tokens": 1169139,
"effective_input_tokens": 121154,
"display_input_tokens": 1290293,
"usage_event_count": 16,
"tool_calls": 19,
"turn_count": 16,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 16,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/benchmark-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 50838,
"generation_ok": false,
"generation_duration_s": 160.154,
"input_tokens": 87235,
"output_tokens": 10902,
"total_tokens": 98137,
"billing_tokens": 98137,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 116736,
"total_cache_tokens": 116736,
"effective_input_tokens": 81499,
"display_input_tokens": 198235,
"usage_event_count": 12,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 12,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/minimax27/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 43165,
"generation_ok": true,
"generation_duration_s": 211.215,
"input_tokens": 444148,
"output_tokens": 7213,
"total_tokens": 451361,
"billing_tokens": 451361,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 355328,
"total_cache_tokens": 355328,
"effective_input_tokens": 88820,
"display_input_tokens": 444148,
"usage_event_count": 18,
"tool_calls": 20,
"turn_count": 18,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 50511,
"generation_ok": false,
"generation_duration_s": 183.748,
"input_tokens": 185140,
"output_tokens": 15068,
"total_tokens": 200208,
"billing_tokens": 200208,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 232320,
"total_cache_tokens": 232320,
"effective_input_tokens": 148313,
"display_input_tokens": 380633,
"usage_event_count": 9,
"tool_calls": 9,
"turn_count": 5,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 21904,
"generation_ok": false,
"generation_duration_s": 64.763,
"input_tokens": 27146,
"output_tokens": 4563,
"total_tokens": 31709,
"billing_tokens": 31709,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 7040,
"total_cache_tokens": 7040,
"effective_input_tokens": 11494,
"display_input_tokens": 18534,
"usage_event_count": 3,
"tool_calls": 3,
"turn_count": 4,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 3,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 14,
"deterministic_warnings": 4,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 4,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 3,
"desktop_warnings": 1,
"mobile_failures": 4,
"mobile_warnings": 1,
"deep_failures": 3,
"deep_warnings": 1,
"mobile_deep_failures": 4,
"mobile_deep_warnings": 1,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 79228,
"generation_ok": false,
"generation_duration_s": 420.033,
"input_tokens": 511926,
"output_tokens": 33192,
"total_tokens": 545118,
"billing_tokens": 545118,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 129664,
"total_cache_tokens": 129664,
"effective_input_tokens": 154885,
"display_input_tokens": 284549,
"usage_event_count": 7,
"tool_calls": 14,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 4,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 45758,
"generation_ok": true,
"generation_duration_s": 106.088,
"input_tokens": 161380,
"output_tokens": 8823,
"total_tokens": 170203,
"billing_tokens": 170203,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 114642,
"cache_write_tokens": 25769,
"cache_hit_tokens": 0,
"total_cache_tokens": 140411,
"effective_input_tokens": 20969,
"display_input_tokens": 161380,
"usage_event_count": 10,
"tool_calls": 12,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/opus47/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 50191,
"generation_ok": true,
"generation_duration_s": 268.356,
"input_tokens": 571314,
"output_tokens": 17059,
"total_tokens": 588373,
"billing_tokens": 588373,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 441950,
"cache_write_tokens": 55976,
"cache_hit_tokens": 0,
"total_cache_tokens": 497926,
"effective_input_tokens": 73388,
"display_input_tokens": 571314,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 58814,
"generation_ok": true,
"generation_duration_s": 206.748,
"input_tokens": 653611,
"output_tokens": 15632,
"total_tokens": 669243,
"billing_tokens": 669243,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 502232,
"cache_write_tokens": 65941,
"cache_hit_tokens": 0,
"total_cache_tokens": 568173,
"effective_input_tokens": 85438,
"display_input_tokens": 653611,
"usage_event_count": 13,
"tool_calls": 19,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 13,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 53012,
"generation_ok": true,
"generation_duration_s": 141.632,
"input_tokens": 206186,
"output_tokens": 9414,
"total_tokens": 215600,
"billing_tokens": 215600,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 160139,
"cache_write_tokens": 23940,
"cache_hit_tokens": 0,
"total_cache_tokens": 184079,
"effective_input_tokens": 22107,
"display_input_tokens": 206186,
"usage_event_count": 11,
"tool_calls": 12,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 64934,
"generation_ok": true,
"generation_duration_s": 150.046,
"input_tokens": 388331,
"output_tokens": 9617,
"total_tokens": 397948,
"billing_tokens": 397948,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 328368,
"cache_write_tokens": 33477,
"cache_hit_tokens": 0,
"total_cache_tokens": 361845,
"effective_input_tokens": 26486,
"display_input_tokens": 388331,
"usage_event_count": 19,
"tool_calls": 22,
"turn_count": 19,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 19,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 52394,
"generation_ok": true,
"generation_duration_s": 203.959,
"input_tokens": 302149,
"output_tokens": 14758,
"total_tokens": 316907,
"billing_tokens": 316907,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 234504,
"cache_write_tokens": 38197,
"cache_hit_tokens": 0,
"total_cache_tokens": 272701,
"effective_input_tokens": 29448,
"display_input_tokens": 302149,
"usage_event_count": 13,
"tool_calls": 15,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 13,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 57805,
"generation_ok": true,
"generation_duration_s": 302.047,
"input_tokens": 477280,
"output_tokens": 18427,
"total_tokens": 495707,
"billing_tokens": 495707,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 368349,
"cache_write_tokens": 44875,
"cache_hit_tokens": 0,
"total_cache_tokens": 413224,
"effective_input_tokens": 64056,
"display_input_tokens": 477280,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 66525,
"generation_ok": true,
"generation_duration_s": 978.64,
"input_tokens": 2649057,
"output_tokens": 62243,
"total_tokens": 2711300,
"billing_tokens": 2711300,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 2413844,
"cache_write_tokens": 135163,
"cache_hit_tokens": 0,
"total_cache_tokens": 2549007,
"effective_input_tokens": 100050,
"display_input_tokens": 2649057,
"usage_event_count": 34,
"tool_calls": 38,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 34,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 49926,
"generation_ok": true,
"generation_duration_s": 196.05,
"input_tokens": 257093,
"output_tokens": 12916,
"total_tokens": 270009,
"billing_tokens": 270009,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 210864,
"cache_write_tokens": 24527,
"cache_hit_tokens": 0,
"total_cache_tokens": 235391,
"effective_input_tokens": 21702,
"display_input_tokens": 257093,
"usage_event_count": 14,
"tool_calls": 15,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 122208,
"generation_ok": true,
"generation_duration_s": 623.147,
"input_tokens": 1192904,
"output_tokens": 48270,
"total_tokens": 1241174,
"billing_tokens": 1241174,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 987803,
"cache_write_tokens": 129337,
"cache_hit_tokens": 0,
"total_cache_tokens": 1117140,
"effective_input_tokens": 75764,
"display_input_tokens": 1192904,
"usage_event_count": 18,
"tool_calls": 22,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
}
]

Xet Storage Details

Size:
204 kB
·
Xet hash:
6727550d9199a333f7d2dbe313d5e3b4347b5e5c4a54cdd75afb3aac2e958117

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.