Buckets:
| [ | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 41655, | |
| "generation_ok": true, | |
| "generation_duration_s": 233.57, | |
| "input_tokens": 257043, | |
| "output_tokens": 19565, | |
| "total_tokens": 276608, | |
| "billing_tokens": 276608, | |
| "reasoning_tokens": 13843, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 236032, | |
| "total_cache_tokens": 236032, | |
| "effective_input_tokens": 21011, | |
| "display_input_tokens": 257043, | |
| "usage_event_count": 12, | |
| "tool_calls": 16, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 40247, | |
| "generation_ok": true, | |
| "generation_duration_s": 251.091, | |
| "input_tokens": 1602209, | |
| "output_tokens": 16541, | |
| "total_tokens": 1618750, | |
| "billing_tokens": 1618750, | |
| "reasoning_tokens": 10735, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1516544, | |
| "total_cache_tokens": 1516544, | |
| "effective_input_tokens": 85665, | |
| "display_input_tokens": 1602209, | |
| "usage_event_count": 24, | |
| "tool_calls": 39, | |
| "turn_count": 24, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 24, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help | sed -n '1,220p' | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 51503, | |
| "generation_ok": true, | |
| "generation_duration_s": 228.357, | |
| "input_tokens": 538144, | |
| "output_tokens": 20613, | |
| "total_tokens": 558757, | |
| "billing_tokens": 558757, | |
| "reasoning_tokens": 12973, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 489472, | |
| "total_cache_tokens": 489472, | |
| "effective_input_tokens": 48672, | |
| "display_input_tokens": 538144, | |
| "usage_event_count": 14, | |
| "tool_calls": 29, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg -n \"^def (contract_findings|compare_stats|screenshot_findings|artifact_screenshot_findings|geometry_findings|render_markdown|capture|find_chrome|capture_height_for_viewport|css_ | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 48838, | |
| "generation_ok": true, | |
| "generation_duration_s": 249.193, | |
| "input_tokens": 122451, | |
| "output_tokens": 13529, | |
| "total_tokens": 135980, | |
| "billing_tokens": 135980, | |
| "reasoning_tokens": 8129, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 103936, | |
| "total_cache_tokens": 103936, | |
| "effective_input_tokens": 18515, | |
| "display_input_tokens": 122451, | |
| "usage_event_count": 8, | |
| "tool_calls": 11, | |
| "turn_count": 8, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 55271, | |
| "generation_ok": true, | |
| "generation_duration_s": 193.592, | |
| "input_tokens": 280048, | |
| "output_tokens": 17564, | |
| "total_tokens": 297612, | |
| "billing_tokens": 297612, | |
| "reasoning_tokens": 9912, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 261120, | |
| "total_cache_tokens": 261120, | |
| "effective_input_tokens": 18928, | |
| "display_input_tokens": 280048, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 4, | |
| "self_check_failed_runs": 3, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 41967, | |
| "generation_ok": true, | |
| "generation_duration_s": 118.283, | |
| "input_tokens": 95354, | |
| "output_tokens": 5337, | |
| "total_tokens": 100691, | |
| "billing_tokens": 100691, | |
| "reasoning_tokens": 402, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 78848, | |
| "total_cache_tokens": 78848, | |
| "effective_input_tokens": 16506, | |
| "display_input_tokens": 95354, | |
| "usage_event_count": 10, | |
| "tool_calls": 10, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 1, | |
| "self_corrected_after_checker": true, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 44204, | |
| "generation_ok": true, | |
| "generation_duration_s": 164.43, | |
| "input_tokens": 388756, | |
| "output_tokens": 7268, | |
| "total_tokens": 396024, | |
| "billing_tokens": 396024, | |
| "reasoning_tokens": 2335, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 346624, | |
| "total_cache_tokens": 346624, | |
| "effective_input_tokens": 42132, | |
| "display_input_tokens": 388756, | |
| "usage_event_count": 16, | |
| "tool_calls": 22, | |
| "turn_count": 16, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 16, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 57189, | |
| "generation_ok": true, | |
| "generation_duration_s": 178.972, | |
| "input_tokens": 450726, | |
| "output_tokens": 9063, | |
| "total_tokens": 459789, | |
| "billing_tokens": 459789, | |
| "reasoning_tokens": 477, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 400896, | |
| "total_cache_tokens": 400896, | |
| "effective_input_tokens": 49830, | |
| "display_input_tokens": 450726, | |
| "usage_event_count": 14, | |
| "tool_calls": 25, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <met | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/mod | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/module-explainer.html')\ns=p.read", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 91, | |
| "task_score": 18.2, | |
| "task_score_max": 20, | |
| "quality_score": 91, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 49708, | |
| "generation_ok": true, | |
| "generation_duration_s": 144.313, | |
| "input_tokens": 129170, | |
| "output_tokens": 6893, | |
| "total_tokens": 136063, | |
| "billing_tokens": 136063, | |
| "reasoning_tokens": 369, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 91136, | |
| "total_cache_tokens": 91136, | |
| "effective_input_tokens": 38034, | |
| "display_input_tokens": 129170, | |
| "usage_event_count": 11, | |
| "tool_calls": 13, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexre | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-fin | ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/implementation-plan.html')", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 53609, | |
| "generation_ok": true, | |
| "generation_duration_s": 142.604, | |
| "input_tokens": 126650, | |
| "output_tokens": 6524, | |
| "total_tokens": 133174, | |
| "billing_tokens": 133174, | |
| "reasoning_tokens": 491, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 101376, | |
| "total_cache_tokens": 101376, | |
| "effective_input_tokens": 25274, | |
| "display_input_tokens": 126650, | |
| "usage_event_count": 11, | |
| "tool_calls": 13, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && python3 - <<'PY'\nfrom pathlib import Path\np=Path('eval-runs/skill-with-shell-codexresponses-gpt-5-5-publication-final/benchmark-comparison.html' | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 17281, | |
| "generation_ok": true, | |
| "generation_duration_s": 82.34, | |
| "input_tokens": 825347, | |
| "output_tokens": 23923, | |
| "total_tokens": 849270, | |
| "billing_tokens": 849270, | |
| "reasoning_tokens": 13374, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 770688, | |
| "total_cache_tokens": 770688, | |
| "effective_input_tokens": 54659, | |
| "display_input_tokens": 825347, | |
| "usage_event_count": 32, | |
| "tool_calls": 31, | |
| "turn_count": 32, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 32, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 9658, | |
| "generation_ok": false, | |
| "generation_duration_s": 60.395, | |
| "input_tokens": 1737615, | |
| "output_tokens": 21291, | |
| "total_tokens": 1758906, | |
| "billing_tokens": 1758906, | |
| "reasoning_tokens": 17081, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1702656, | |
| "total_cache_tokens": 1702656, | |
| "effective_input_tokens": 86941, | |
| "display_input_tokens": 1789597, | |
| "usage_event_count": 41, | |
| "tool_calls": 32, | |
| "turn_count": 26, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 41, | |
| "self_check_mode": "checker-shell-reference,read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '1,260p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '260,560p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '560,920p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '920,1320p'", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 16366, | |
| "generation_ok": false, | |
| "generation_duration_s": 87.747, | |
| "input_tokens": 2740590, | |
| "output_tokens": 27049, | |
| "total_tokens": 2767639, | |
| "billing_tokens": 2767639, | |
| "reasoning_tokens": 15704, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2024320, | |
| "total_cache_tokens": 2024320, | |
| "effective_input_tokens": 202803, | |
| "display_input_tokens": 2227123, | |
| "usage_event_count": 35, | |
| "tool_calls": 51, | |
| "turn_count": 42, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 35, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node|flow-edge|flow-list|flow-step|metric-row|chart-panel|finding|code-block|copyable|timeline)\" styles/birch-system.css | shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain | ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 4, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 1, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 1, | |
| "artifact_present": true, | |
| "artifact_score_100": 91, | |
| "task_score": 18.2, | |
| "task_score_max": 20, | |
| "quality_score": 91, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 46864, | |
| "generation_ok": true, | |
| "generation_duration_s": 91.953, | |
| "input_tokens": 1108319, | |
| "output_tokens": 14746, | |
| "total_tokens": 1123065, | |
| "billing_tokens": 1123065, | |
| "reasoning_tokens": 8043, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1055232, | |
| "total_cache_tokens": 1055232, | |
| "effective_input_tokens": 53087, | |
| "display_input_tokens": 1108319, | |
| "usage_event_count": 35, | |
| "tool_calls": 37, | |
| "turn_count": 35, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 35, | |
| "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | head -n 120 | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 55786, | |
| "generation_ok": true, | |
| "generation_duration_s": 41.038, | |
| "input_tokens": 681289, | |
| "output_tokens": 5651, | |
| "total_tokens": 686940, | |
| "billing_tokens": 686940, | |
| "reasoning_tokens": 4100, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 628224, | |
| "total_cache_tokens": 628224, | |
| "effective_input_tokens": 53065, | |
| "display_input_tokens": 681289, | |
| "usage_event_count": 24, | |
| "tool_calls": 23, | |
| "turn_count": 24, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 24, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 62489, | |
| "generation_ok": true, | |
| "generation_duration_s": 280.24, | |
| "input_tokens": 594128, | |
| "output_tokens": 18097, | |
| "total_tokens": 612225, | |
| "billing_tokens": 612225, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 560512, | |
| "total_cache_tokens": 560512, | |
| "effective_input_tokens": 33616, | |
| "display_input_tokens": 594128, | |
| "usage_event_count": 18, | |
| "tool_calls": 20, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 62789, | |
| "generation_ok": true, | |
| "generation_duration_s": 294.1, | |
| "input_tokens": 784186, | |
| "output_tokens": 14634, | |
| "total_tokens": 798820, | |
| "billing_tokens": 798820, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 749440, | |
| "total_cache_tokens": 749440, | |
| "effective_input_tokens": 34746, | |
| "display_input_tokens": 784186, | |
| "usage_event_count": 26, | |
| "tool_calls": 30, | |
| "turn_count": 26, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 26, | |
| "self_check_mode": "checker-shell-reference,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review | shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py | grep -A5 \"add_argument\" | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\|--artifact\" skill/scripts/check_birch_renderings.py | head -10 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py | head -3 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 31473, | |
| "generation_ok": false, | |
| "generation_duration_s": 177.334, | |
| "input_tokens": 215656, | |
| "output_tokens": 9938, | |
| "total_tokens": 225594, | |
| "billing_tokens": 225594, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 449920, | |
| "total_cache_tokens": 449920, | |
| "effective_input_tokens": 48511, | |
| "display_input_tokens": 498431, | |
| "usage_event_count": 10, | |
| "tool_calls": 10, | |
| "turn_count": 6, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 1, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 3, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 3, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 3, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 52099, | |
| "generation_ok": true, | |
| "generation_duration_s": 112.544, | |
| "input_tokens": 173739, | |
| "output_tokens": 6911, | |
| "total_tokens": 180650, | |
| "billing_tokens": 180650, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 160128, | |
| "total_cache_tokens": 160128, | |
| "effective_input_tokens": 13611, | |
| "display_input_tokens": 173739, | |
| "usage_event_count": 12, | |
| "tool_calls": 15, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 78962, | |
| "generation_ok": true, | |
| "generation_duration_s": 378.136, | |
| "input_tokens": 767427, | |
| "output_tokens": 27984, | |
| "total_tokens": 795411, | |
| "billing_tokens": 795411, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 717696, | |
| "total_cache_tokens": 717696, | |
| "effective_input_tokens": 49731, | |
| "display_input_tokens": 767427, | |
| "usage_event_count": 18, | |
| "tool_calls": 22, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "checker-shell-reference", | |
| "self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 53215, | |
| "generation_ok": true, | |
| "generation_duration_s": 114.216, | |
| "input_tokens": 1371616, | |
| "output_tokens": 5260, | |
| "total_tokens": 1376876, | |
| "billing_tokens": 1376876, | |
| "reasoning_tokens": 12418, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1116684, | |
| "total_cache_tokens": 1116684, | |
| "effective_input_tokens": 254932, | |
| "display_input_tokens": 1371616, | |
| "usage_event_count": 29, | |
| "tool_calls": 28, | |
| "turn_count": 29, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 29, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html | ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 53047, | |
| "generation_ok": true, | |
| "generation_duration_s": 193.238, | |
| "input_tokens": 1684136, | |
| "output_tokens": 6902, | |
| "total_tokens": 1691038, | |
| "billing_tokens": 1691038, | |
| "reasoning_tokens": 23273, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1424691, | |
| "total_cache_tokens": 1424691, | |
| "effective_input_tokens": 259445, | |
| "display_input_tokens": 1684136, | |
| "usage_event_count": 34, | |
| "tool_calls": 33, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 57420, | |
| "generation_ok": true, | |
| "generation_duration_s": 203.178, | |
| "input_tokens": 2196880, | |
| "output_tokens": 10222, | |
| "total_tokens": 2207102, | |
| "billing_tokens": 2207102, | |
| "reasoning_tokens": 22501, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1965131, | |
| "total_cache_tokens": 1965131, | |
| "effective_input_tokens": 231749, | |
| "display_input_tokens": 2196880, | |
| "usage_event_count": 33, | |
| "tool_calls": 32, | |
| "turn_count": 33, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 2, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 33, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read scripts/check_birch_renderings.py | ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 49628, | |
| "generation_ok": true, | |
| "generation_duration_s": 201.715, | |
| "input_tokens": 2346900, | |
| "output_tokens": 9173, | |
| "total_tokens": 2356073, | |
| "billing_tokens": 2356073, | |
| "reasoning_tokens": 15150, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2043078, | |
| "total_cache_tokens": 2043078, | |
| "effective_input_tokens": 303822, | |
| "display_input_tokens": 2346900, | |
| "usage_event_count": 34, | |
| "tool_calls": 33, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 5, | |
| "self_check_failed_runs": 4, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 97390, | |
| "generation_ok": true, | |
| "generation_duration_s": 62.077, | |
| "input_tokens": 495825, | |
| "output_tokens": 829, | |
| "total_tokens": 496654, | |
| "billing_tokens": 496654, | |
| "reasoning_tokens": 4961, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 387138, | |
| "total_cache_tokens": 387138, | |
| "effective_input_tokens": 108687, | |
| "display_input_tokens": 495825, | |
| "usage_event_count": 17, | |
| "tool_calls": 16, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 62971, | |
| "generation_ok": true, | |
| "generation_duration_s": 300.114, | |
| "input_tokens": 459899, | |
| "output_tokens": 16275, | |
| "total_tokens": 476174, | |
| "billing_tokens": 476174, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 369152, | |
| "total_cache_tokens": 369152, | |
| "effective_input_tokens": 90747, | |
| "display_input_tokens": 459899, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/glm51/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 48933, | |
| "generation_ok": true, | |
| "generation_duration_s": 133.324, | |
| "input_tokens": 254816, | |
| "output_tokens": 8008, | |
| "total_tokens": 262824, | |
| "billing_tokens": 262824, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 202560, | |
| "total_cache_tokens": 202560, | |
| "effective_input_tokens": 52256, | |
| "display_input_tokens": 254816, | |
| "usage_event_count": 11, | |
| "tool_calls": 13, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 2, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 92, | |
| "task_score": 18.4, | |
| "task_score_max": 20, | |
| "quality_score": 92, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 54229, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.822, | |
| "input_tokens": 358438, | |
| "output_tokens": 6652, | |
| "total_tokens": 365090, | |
| "billing_tokens": 365090, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 254656, | |
| "total_cache_tokens": 254656, | |
| "effective_input_tokens": 103782, | |
| "display_input_tokens": 358438, | |
| "usage_event_count": 9, | |
| "tool_calls": 15, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 60535, | |
| "generation_ok": true, | |
| "generation_duration_s": 90.03, | |
| "input_tokens": 210191, | |
| "output_tokens": 7574, | |
| "total_tokens": 217765, | |
| "billing_tokens": 217765, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 180736, | |
| "total_cache_tokens": 180736, | |
| "effective_input_tokens": 29455, | |
| "display_input_tokens": 210191, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation", | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 64863, | |
| "generation_ok": true, | |
| "generation_duration_s": 149.159, | |
| "input_tokens": 274201, | |
| "output_tokens": 14416, | |
| "total_tokens": 288617, | |
| "billing_tokens": 288617, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 214336, | |
| "total_cache_tokens": 214336, | |
| "effective_input_tokens": 59865, | |
| "display_input_tokens": 274201, | |
| "usage_event_count": 12, | |
| "tool_calls": 14, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm52", | |
| "model_slug": "glm52", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm52-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/glm52/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 51395, | |
| "generation_ok": true, | |
| "generation_duration_s": 274.73, | |
| "input_tokens": 271862, | |
| "output_tokens": 16133, | |
| "total_tokens": 287995, | |
| "billing_tokens": 287995, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 239936, | |
| "total_cache_tokens": 239936, | |
| "effective_input_tokens": 31926, | |
| "display_input_tokens": 271862, | |
| "usage_event_count": 14, | |
| "tool_calls": 17, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html && echo \"=== finished ===\" && uv | ran checker CLI: cd /home/shaun/source/birch-html && F=eval-runs/skill-with-shell-glm52-publication-final/numeric-data.html\necho \"=== my local style block (after system block) ===\"\npython3 - \"$F\" <", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm52", | |
| "model_slug": "glm52", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm52-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/glm52/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 50845, | |
| "generation_ok": true, | |
| "generation_duration_s": 771.097, | |
| "input_tokens": 1249523, | |
| "output_tokens": 43260, | |
| "total_tokens": 1292783, | |
| "billing_tokens": 1292783, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1177792, | |
| "total_cache_tokens": 1177792, | |
| "effective_input_tokens": 71731, | |
| "display_input_tokens": 1249523, | |
| "usage_event_count": 25, | |
| "tool_calls": 32, | |
| "turn_count": 25, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 25, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/code-review.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/code-r | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --help 2>&1 | rg -i 'viewport|artifact|mobile|width' | head; echo \"=== run m | checker CLI usage error", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm52", | |
| "model_slug": "glm52", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm52-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/glm52/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 59952, | |
| "generation_ok": true, | |
| "generation_duration_s": 751.331, | |
| "input_tokens": 1204327, | |
| "output_tokens": 40435, | |
| "total_tokens": 1244762, | |
| "billing_tokens": 1244762, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1118592, | |
| "total_cache_tokens": 1118592, | |
| "effective_input_tokens": 85735, | |
| "display_input_tokens": 1204327, | |
| "usage_event_count": 22, | |
| "tool_calls": 32, | |
| "turn_count": 22, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 22, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/module-explainer.htm", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm52", | |
| "model_slug": "glm52", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm52-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/glm52/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 56320, | |
| "generation_ok": true, | |
| "generation_duration_s": 456.209, | |
| "input_tokens": 991570, | |
| "output_tokens": 24123, | |
| "total_tokens": 1015693, | |
| "billing_tokens": 1015693, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 911168, | |
| "total_cache_tokens": 911168, | |
| "effective_input_tokens": 80402, | |
| "display_input_tokens": 991570, | |
| "usage_event_count": 18, | |
| "tool_calls": 26, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/implementation | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-glm52-pu", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm52", | |
| "model_slug": "glm52", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm52-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/glm52/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm52/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 60487, | |
| "generation_ok": true, | |
| "generation_duration_s": 380.184, | |
| "input_tokens": 522022, | |
| "output_tokens": 23534, | |
| "total_tokens": 545556, | |
| "billing_tokens": 545556, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 459648, | |
| "total_cache_tokens": 459648, | |
| "effective_input_tokens": 62374, | |
| "display_input_tokens": 522022, | |
| "usage_event_count": 16, | |
| "tool_calls": 19, | |
| "turn_count": 16, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 16, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm52-publication-final/benchmark-comp | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-glm52-publication-final/benchm", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 40305, | |
| "generation_ok": true, | |
| "generation_duration_s": 63.372, | |
| "input_tokens": 91503, | |
| "output_tokens": 5097, | |
| "total_tokens": 96600, | |
| "billing_tokens": 96600, | |
| "reasoning_tokens": 1083, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 76800, | |
| "total_cache_tokens": 76800, | |
| "effective_input_tokens": 14703, | |
| "display_input_tokens": 91503, | |
| "usage_event_count": 8, | |
| "tool_calls": 11, | |
| "turn_count": 8, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 39494, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.334, | |
| "input_tokens": 461816, | |
| "output_tokens": 6027, | |
| "total_tokens": 467843, | |
| "billing_tokens": 467843, | |
| "reasoning_tokens": 2855, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 384640, | |
| "total_cache_tokens": 384640, | |
| "effective_input_tokens": 77176, | |
| "display_input_tokens": 461816, | |
| "usage_event_count": 17, | |
| "tool_calls": 18, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 46290, | |
| "generation_ok": true, | |
| "generation_duration_s": 93.641, | |
| "input_tokens": 555669, | |
| "output_tokens": 7177, | |
| "total_tokens": 562846, | |
| "billing_tokens": 562846, | |
| "reasoning_tokens": 1701, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 450304, | |
| "total_cache_tokens": 450304, | |
| "effective_input_tokens": 105365, | |
| "display_input_tokens": 555669, | |
| "usage_event_count": 17, | |
| "tool_calls": 23, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu | ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h | checker CLI usage error", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 91, | |
| "task_score": 18.2, | |
| "task_score_max": 20, | |
| "quality_score": 91, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 45485, | |
| "generation_ok": true, | |
| "generation_duration_s": 59.362, | |
| "input_tokens": 90659, | |
| "output_tokens": 4766, | |
| "total_tokens": 95425, | |
| "billing_tokens": 95425, | |
| "reasoning_tokens": 589, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 71168, | |
| "total_cache_tokens": 71168, | |
| "effective_input_tokens": 19491, | |
| "display_input_tokens": 90659, | |
| "usage_event_count": 9, | |
| "tool_calls": 10, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 46793, | |
| "generation_ok": true, | |
| "generation_duration_s": 61.812, | |
| "input_tokens": 60483, | |
| "output_tokens": 5615, | |
| "total_tokens": 66098, | |
| "billing_tokens": 66098, | |
| "reasoning_tokens": 746, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 53376, | |
| "total_cache_tokens": 53376, | |
| "effective_input_tokens": 7107, | |
| "display_input_tokens": 60483, | |
| "usage_event_count": 7, | |
| "tool_calls": 8, | |
| "turn_count": 7, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 88, | |
| "task_score": 17.6, | |
| "task_score_max": 20, | |
| "quality_score": 88, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 36903, | |
| "generation_ok": true, | |
| "generation_duration_s": 49.028, | |
| "input_tokens": 73338, | |
| "output_tokens": 3307, | |
| "total_tokens": 76645, | |
| "billing_tokens": 76645, | |
| "reasoning_tokens": 925, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 62720, | |
| "total_cache_tokens": 62720, | |
| "effective_input_tokens": 10618, | |
| "display_input_tokens": 73338, | |
| "usage_event_count": 10, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 38297, | |
| "generation_ok": true, | |
| "generation_duration_s": 55.392, | |
| "input_tokens": 190492, | |
| "output_tokens": 4553, | |
| "total_tokens": 195045, | |
| "billing_tokens": 195045, | |
| "reasoning_tokens": 2340, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 147520, | |
| "total_cache_tokens": 147520, | |
| "effective_input_tokens": 42972, | |
| "display_input_tokens": 190492, | |
| "usage_event_count": 11, | |
| "tool_calls": 10, | |
| "turn_count": 11, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 9279, | |
| "generation_ok": false, | |
| "generation_duration_s": 40.052, | |
| "input_tokens": 125766, | |
| "output_tokens": 3826, | |
| "total_tokens": 129592, | |
| "billing_tokens": 129592, | |
| "reasoning_tokens": 1202, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 46784, | |
| "total_cache_tokens": 46784, | |
| "effective_input_tokens": 53433, | |
| "display_input_tokens": 100217, | |
| "usage_event_count": 15, | |
| "tool_calls": 6, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 3, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 16152, | |
| "generation_ok": false, | |
| "generation_duration_s": 41.596, | |
| "input_tokens": 32235, | |
| "output_tokens": 5236, | |
| "total_tokens": 37471, | |
| "billing_tokens": 37471, | |
| "reasoning_tokens": 1207, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 39488, | |
| "total_cache_tokens": 39488, | |
| "effective_input_tokens": 20479, | |
| "display_input_tokens": 59967, | |
| "usage_event_count": 8, | |
| "tool_calls": 4, | |
| "turn_count": 5, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 10364, | |
| "generation_ok": false, | |
| "generation_duration_s": 98.19, | |
| "input_tokens": 153411, | |
| "output_tokens": 7388, | |
| "total_tokens": 160799, | |
| "billing_tokens": 160799, | |
| "reasoning_tokens": 2517, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 39488, | |
| "total_cache_tokens": 39488, | |
| "effective_input_tokens": 6645, | |
| "display_input_tokens": 46133, | |
| "usage_event_count": 8, | |
| "tool_calls": 15, | |
| "turn_count": 16, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 23937, | |
| "generation_ok": false, | |
| "generation_duration_s": 67.62, | |
| "input_tokens": 119520, | |
| "output_tokens": 7707, | |
| "total_tokens": 127227, | |
| "billing_tokens": 127227, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 7297, | |
| "cache_write_tokens": 12081, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 19378, | |
| "effective_input_tokens": 11280, | |
| "display_input_tokens": 30658, | |
| "usage_event_count": 4, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 4, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data", | |
| "deterministic_failures": 16, | |
| "deterministic_warnings": 12, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 3, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 4, | |
| "desktop_warnings": 3, | |
| "mobile_failures": 4, | |
| "mobile_warnings": 3, | |
| "deep_failures": 4, | |
| "deep_warnings": 3, | |
| "mobile_deep_failures": 4, | |
| "mobile_deep_warnings": 3, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 53526, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.461, | |
| "input_tokens": 301467, | |
| "output_tokens": 10117, | |
| "total_tokens": 311584, | |
| "billing_tokens": 311584, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 228528, | |
| "cache_write_tokens": 34499, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 263027, | |
| "effective_input_tokens": 38440, | |
| "display_input_tokens": 301467, | |
| "usage_event_count": 11, | |
| "tool_calls": 11, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 87, | |
| "task_score": 17.4, | |
| "task_score_max": 20, | |
| "quality_score": 87, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 57853, | |
| "generation_ok": false, | |
| "generation_duration_s": 75.42, | |
| "input_tokens": 211164, | |
| "output_tokens": 9407, | |
| "total_tokens": 220571, | |
| "billing_tokens": 220571, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 55031, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 55031, | |
| "effective_input_tokens": 80985, | |
| "display_input_tokens": 136016, | |
| "usage_event_count": 3, | |
| "tool_calls": 10, | |
| "turn_count": 6, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 3, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 50641, | |
| "generation_ok": true, | |
| "generation_duration_s": 67.418, | |
| "input_tokens": 123711, | |
| "output_tokens": 7166, | |
| "total_tokens": 130877, | |
| "billing_tokens": 130877, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 91600, | |
| "cache_write_tokens": 16126, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 107726, | |
| "effective_input_tokens": 15985, | |
| "display_input_tokens": 123711, | |
| "usage_event_count": 9, | |
| "tool_calls": 9, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 49137, | |
| "generation_ok": true, | |
| "generation_duration_s": 65.28, | |
| "input_tokens": 151349, | |
| "output_tokens": 7796, | |
| "total_tokens": 159145, | |
| "billing_tokens": 159145, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 122743, | |
| "cache_write_tokens": 12640, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 135383, | |
| "effective_input_tokens": 15966, | |
| "display_input_tokens": 151349, | |
| "usage_event_count": 11, | |
| "tool_calls": 10, | |
| "turn_count": 11, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 3, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 67620, | |
| "generation_ok": true, | |
| "generation_duration_s": 194.344, | |
| "input_tokens": 470039, | |
| "output_tokens": 5317, | |
| "total_tokens": 475356, | |
| "billing_tokens": 475356, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 425472, | |
| "total_cache_tokens": 425472, | |
| "effective_input_tokens": 44567, | |
| "display_input_tokens": 470039, | |
| "usage_event_count": 20, | |
| "tool_calls": 23, | |
| "turn_count": 20, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 20, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/kimi/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 44300, | |
| "generation_ok": true, | |
| "generation_duration_s": 627.536, | |
| "input_tokens": 1248543, | |
| "output_tokens": 24596, | |
| "total_tokens": 1273139, | |
| "billing_tokens": 1273139, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1192448, | |
| "total_cache_tokens": 1192448, | |
| "effective_input_tokens": 56095, | |
| "display_input_tokens": 1248543, | |
| "usage_event_count": 33, | |
| "tool_calls": 36, | |
| "turn_count": 33, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 33, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"CANDLE_CLASSES\\|BIRCH_CLASSES\\|LAYOUT_CLASSES\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"eyebrow\\|lede\\|muted\\|caption\\|subtle\\|note\\|entity\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 17730, | |
| "generation_ok": false, | |
| "generation_duration_s": 142.653, | |
| "input_tokens": 54919, | |
| "output_tokens": 5427, | |
| "total_tokens": 60346, | |
| "billing_tokens": 60346, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 0, | |
| "effective_input_tokens": 54919, | |
| "display_input_tokens": 54919, | |
| "usage_event_count": 5, | |
| "tool_calls": 10, | |
| "turn_count": 5, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 5, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 3, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 50937, | |
| "generation_ok": true, | |
| "generation_duration_s": 372.779, | |
| "input_tokens": 468652, | |
| "output_tokens": 19358, | |
| "total_tokens": 488010, | |
| "billing_tokens": 488010, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 415232, | |
| "total_cache_tokens": 415232, | |
| "effective_input_tokens": 53420, | |
| "display_input_tokens": 468652, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 51725, | |
| "generation_ok": true, | |
| "generation_duration_s": 427.336, | |
| "input_tokens": 358341, | |
| "output_tokens": 15297, | |
| "total_tokens": 373638, | |
| "billing_tokens": 373638, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 299776, | |
| "total_cache_tokens": 299776, | |
| "effective_input_tokens": 58565, | |
| "display_input_tokens": 358341, | |
| "usage_event_count": 14, | |
| "tool_calls": 14, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi27", | |
| "model_slug": "kimi27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi27-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/kimi27/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 41967, | |
| "generation_ok": true, | |
| "generation_duration_s": 210.371, | |
| "input_tokens": 1978925, | |
| "output_tokens": 17532, | |
| "total_tokens": 1996457, | |
| "billing_tokens": 1996457, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1687898, | |
| "total_cache_tokens": 1687898, | |
| "effective_input_tokens": 291027, | |
| "display_input_tokens": 1978925, | |
| "usage_event_count": 30, | |
| "tool_calls": 32, | |
| "turn_count": 30, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 30, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && if [ -f skill/scripts/check_birch_renderings.py ]; then uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs | read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/numeric-data.html && uv run --with pillow python s", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi27", | |
| "model_slug": "kimi27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi27-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/kimi27/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 47402, | |
| "generation_ok": true, | |
| "generation_duration_s": 253.252, | |
| "input_tokens": 1509119, | |
| "output_tokens": 28034, | |
| "total_tokens": 1537153, | |
| "billing_tokens": 1537153, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1144217, | |
| "total_cache_tokens": 1144217, | |
| "effective_input_tokens": 364902, | |
| "display_input_tokens": 1509119, | |
| "usage_event_count": 25, | |
| "tool_calls": 30, | |
| "turn_count": 25, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 25, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/code-review.html 2> | ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-kimi27-publication-final/code-review.html && uv run --with pillow py", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi27", | |
| "model_slug": "kimi27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi27-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/kimi27/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 52748, | |
| "generation_ok": true, | |
| "generation_duration_s": 136.617, | |
| "input_tokens": 582570, | |
| "output_tokens": 12473, | |
| "total_tokens": 595043, | |
| "billing_tokens": 595043, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 334281, | |
| "total_cache_tokens": 334281, | |
| "effective_input_tokens": 248289, | |
| "display_input_tokens": 582570, | |
| "usage_event_count": 7, | |
| "tool_calls": 14, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/module-explainer.ht", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi27", | |
| "model_slug": "kimi27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi27-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/kimi27/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 52277, | |
| "generation_ok": true, | |
| "generation_duration_s": 72.968, | |
| "input_tokens": 487122, | |
| "output_tokens": 6684, | |
| "total_tokens": 493806, | |
| "billing_tokens": 493806, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 332350, | |
| "total_cache_tokens": 332350, | |
| "effective_input_tokens": 154772, | |
| "display_input_tokens": 487122, | |
| "usage_event_count": 9, | |
| "tool_calls": 9, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi27", | |
| "model_slug": "kimi27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi27-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/kimi27/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi27/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 59856, | |
| "generation_ok": true, | |
| "generation_duration_s": 159.927, | |
| "input_tokens": 1290293, | |
| "output_tokens": 18058, | |
| "total_tokens": 1308351, | |
| "billing_tokens": 1308351, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1169139, | |
| "total_cache_tokens": 1169139, | |
| "effective_input_tokens": 121154, | |
| "display_input_tokens": 1290293, | |
| "usage_event_count": 16, | |
| "tool_calls": 19, | |
| "turn_count": 16, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 16, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi27-publication-final/benchmark-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi27-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 50838, | |
| "generation_ok": false, | |
| "generation_duration_s": 160.154, | |
| "input_tokens": 87235, | |
| "output_tokens": 10902, | |
| "total_tokens": 98137, | |
| "billing_tokens": 98137, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 116736, | |
| "total_cache_tokens": 116736, | |
| "effective_input_tokens": 81499, | |
| "display_input_tokens": 198235, | |
| "usage_event_count": 12, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 43165, | |
| "generation_ok": true, | |
| "generation_duration_s": 211.215, | |
| "input_tokens": 444148, | |
| "output_tokens": 7213, | |
| "total_tokens": 451361, | |
| "billing_tokens": 451361, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 355328, | |
| "total_cache_tokens": 355328, | |
| "effective_input_tokens": 88820, | |
| "display_input_tokens": 444148, | |
| "usage_event_count": 18, | |
| "tool_calls": 20, | |
| "turn_count": 18, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 50511, | |
| "generation_ok": false, | |
| "generation_duration_s": 183.748, | |
| "input_tokens": 185140, | |
| "output_tokens": 15068, | |
| "total_tokens": 200208, | |
| "billing_tokens": 200208, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 232320, | |
| "total_cache_tokens": 232320, | |
| "effective_input_tokens": 148313, | |
| "display_input_tokens": 380633, | |
| "usage_event_count": 9, | |
| "tool_calls": 9, | |
| "turn_count": 5, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 21904, | |
| "generation_ok": false, | |
| "generation_duration_s": 64.763, | |
| "input_tokens": 27146, | |
| "output_tokens": 4563, | |
| "total_tokens": 31709, | |
| "billing_tokens": 31709, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 7040, | |
| "total_cache_tokens": 7040, | |
| "effective_input_tokens": 11494, | |
| "display_input_tokens": 18534, | |
| "usage_event_count": 3, | |
| "tool_calls": 3, | |
| "turn_count": 4, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 3, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 14, | |
| "deterministic_warnings": 4, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 3, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 4, | |
| "mobile_warnings": 1, | |
| "deep_failures": 3, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 4, | |
| "mobile_deep_warnings": 1, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 79228, | |
| "generation_ok": false, | |
| "generation_duration_s": 420.033, | |
| "input_tokens": 511926, | |
| "output_tokens": 33192, | |
| "total_tokens": 545118, | |
| "billing_tokens": 545118, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 129664, | |
| "total_cache_tokens": 129664, | |
| "effective_input_tokens": 154885, | |
| "display_input_tokens": 284549, | |
| "usage_event_count": 7, | |
| "tool_calls": 14, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 4, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 45758, | |
| "generation_ok": true, | |
| "generation_duration_s": 106.088, | |
| "input_tokens": 161380, | |
| "output_tokens": 8823, | |
| "total_tokens": 170203, | |
| "billing_tokens": 170203, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 114642, | |
| "cache_write_tokens": 25769, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 140411, | |
| "effective_input_tokens": 20969, | |
| "display_input_tokens": 161380, | |
| "usage_event_count": 10, | |
| "tool_calls": 12, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/opus47/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 50191, | |
| "generation_ok": true, | |
| "generation_duration_s": 268.356, | |
| "input_tokens": 571314, | |
| "output_tokens": 17059, | |
| "total_tokens": 588373, | |
| "billing_tokens": 588373, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 441950, | |
| "cache_write_tokens": 55976, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 497926, | |
| "effective_input_tokens": 73388, | |
| "display_input_tokens": 571314, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 58814, | |
| "generation_ok": true, | |
| "generation_duration_s": 206.748, | |
| "input_tokens": 653611, | |
| "output_tokens": 15632, | |
| "total_tokens": 669243, | |
| "billing_tokens": 669243, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 502232, | |
| "cache_write_tokens": 65941, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 568173, | |
| "effective_input_tokens": 85438, | |
| "display_input_tokens": 653611, | |
| "usage_event_count": 13, | |
| "tool_calls": 19, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 13, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 53012, | |
| "generation_ok": true, | |
| "generation_duration_s": 141.632, | |
| "input_tokens": 206186, | |
| "output_tokens": 9414, | |
| "total_tokens": 215600, | |
| "billing_tokens": 215600, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 160139, | |
| "cache_write_tokens": 23940, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 184079, | |
| "effective_input_tokens": 22107, | |
| "display_input_tokens": 206186, | |
| "usage_event_count": 11, | |
| "tool_calls": 12, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 64934, | |
| "generation_ok": true, | |
| "generation_duration_s": 150.046, | |
| "input_tokens": 388331, | |
| "output_tokens": 9617, | |
| "total_tokens": 397948, | |
| "billing_tokens": 397948, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 328368, | |
| "cache_write_tokens": 33477, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 361845, | |
| "effective_input_tokens": 26486, | |
| "display_input_tokens": 388331, | |
| "usage_event_count": 19, | |
| "tool_calls": 22, | |
| "turn_count": 19, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 19, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 52394, | |
| "generation_ok": true, | |
| "generation_duration_s": 203.959, | |
| "input_tokens": 302149, | |
| "output_tokens": 14758, | |
| "total_tokens": 316907, | |
| "billing_tokens": 316907, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 234504, | |
| "cache_write_tokens": 38197, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 272701, | |
| "effective_input_tokens": 29448, | |
| "display_input_tokens": 302149, | |
| "usage_event_count": 13, | |
| "tool_calls": 15, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 13, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 57805, | |
| "generation_ok": true, | |
| "generation_duration_s": 302.047, | |
| "input_tokens": 477280, | |
| "output_tokens": 18427, | |
| "total_tokens": 495707, | |
| "billing_tokens": 495707, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 368349, | |
| "cache_write_tokens": 44875, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 413224, | |
| "effective_input_tokens": 64056, | |
| "display_input_tokens": 477280, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 66525, | |
| "generation_ok": true, | |
| "generation_duration_s": 978.64, | |
| "input_tokens": 2649057, | |
| "output_tokens": 62243, | |
| "total_tokens": 2711300, | |
| "billing_tokens": 2711300, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 2413844, | |
| "cache_write_tokens": 135163, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 2549007, | |
| "effective_input_tokens": 100050, | |
| "display_input_tokens": 2649057, | |
| "usage_event_count": 34, | |
| "tool_calls": 38, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 49926, | |
| "generation_ok": true, | |
| "generation_duration_s": 196.05, | |
| "input_tokens": 257093, | |
| "output_tokens": 12916, | |
| "total_tokens": 270009, | |
| "billing_tokens": 270009, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 210864, | |
| "cache_write_tokens": 24527, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 235391, | |
| "effective_input_tokens": 21702, | |
| "display_input_tokens": 257093, | |
| "usage_event_count": 14, | |
| "tool_calls": 15, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 122208, | |
| "generation_ok": true, | |
| "generation_duration_s": 623.147, | |
| "input_tokens": 1192904, | |
| "output_tokens": 48270, | |
| "total_tokens": 1241174, | |
| "billing_tokens": 1241174, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 987803, | |
| "cache_write_tokens": 129337, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 1117140, | |
| "effective_input_tokens": 75764, | |
| "display_input_tokens": 1192904, | |
| "usage_event_count": 18, | |
| "tool_calls": 22, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| } | |
| ] | |
Xet Storage Details
- Size:
- 204 kB
- Xet hash:
- 6727550d9199a333f7d2dbe313d5e3b4347b5e5c4a54cdd75afb3aac2e958117
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.