% Source: scrubdata/docs/paper/numbers.tex (6.5 kB)
% Commit 16dc556 (OpenAI Codex):
%   deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
% Result macros --- every value regenerates from one command (see Reproducibility section).
% Headline fine-tune results (synthetic frozen gold, Layer 1).
% \canonFMultiSeed carries its own $...$ delimiters plus a text parenthetical,
% so it must be used in text mode (inside math its leading $ would close the
% surrounding formula).
\newcommand*{\canonFOurs}{0.815}     % v5 bf16, n=20 (single run; multi-seed below)
\newcommand*{\canonFOursBest}{0.901} % v4 Q8 measurement
\newcommand*{\canonFBig}{0.452}      % large generic model (GLM-class, zero-shot)
\newcommand*{\canonFHeur}{0.152}     % rule heuristic
\newcommand*{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
\newcommand*{\opFOurs}{0.957}
\newcommand*{\jsonValidOurs}{0.950}
% Hospital head-to-head: repairs-only, churn-neutral scoring
% (both paths include the errors-are-rare gates).
\newcommand*{\hospRecallGrounded}{0.257}
\newcommand*{\hospRecallFreq}{0.293}
\newcommand*{\hospPrecGrounded}{0.845}
\newcommand*{\hospPrecFreq}{0.871}
\newcommand*{\hospModelRecall}{0.424} % fine-tuned v5, repair_recall (vs 0.000 synthetic-only)
% Wide-suite comparison (3 seeds, churn-neutral metric) --- money table.
% PRIMARY values = HEAD regeneration of 2026-06-12
% (eval/results/money_table_head.json; post-capability system, NaN metric fix in).
% The freeze (2026-06-10) values are kept as *Freeze macros for the places where
% the narrative discusses the freeze-version system.
\newcommand*{\northGrounded}{0.224}
\newcommand*{\northGroundedCI}{0.004}
\newcommand*{\northORFp}{0.211}
\newcommand*{\northORKnn}{0.122}
\newcommand*{\realFGrounded}{0.225}
\newcommand*{\realFORKnn}{0.058}
\newcommand*{\damageGrounded}{0.092}
\newcommand*{\damageORKnn}{0.096}
% Freeze-version counterparts of the three grounded HEAD values above.
\newcommand*{\northGroundedFreeze}{0.203}
\newcommand*{\realFGroundedFreeze}{0.174}
\newcommand*{\damageGroundedFreeze}{0.104}
% SHIPPED system (verified union, v6 adapter) evaluated on the suite ---
% produced by scripts/modal_eval_suite.py.
\newcommand*{\modelRealF}{0.142}
\newcommand*{\modelDamage}{0.015}
\newcommand*{\modelAbstain}{1.000}
% Ablations (churn-neutral metric, 3 seeds --- eval/results/ablations.json).
% NOTE(review): \ablFull, \ablFullRealF and \ablFullDamage hold the same values as
% \northGroundedFreeze, \realFGroundedFreeze and \damageGroundedFreeze, which
% suggests the ablation baseline is the freeze-version system --- confirm.
\newcommand*{\ablFull}{0.203}
\newcommand*{\ablNoGround}{0.223}
\newcommand*{\ablNoAbstain}{0.190}
\newcommand*{\ablNoMargin}{0.197}
\newcommand*{\ablNoCase}{0.201}
\newcommand*{\ablFullRealF}{0.174}
\newcommand*{\ablNoGroundRealF}{0.135}
\newcommand*{\ablNoAbstainDamage}{0.108}
\newcommand*{\ablFullDamage}{0.104}
\newcommand*{\ablFullAbstain}{1.000}
\newcommand*{\ablNoAbstainAbstain}{0.250}
% Selective prediction / calibration figures.
\newcommand*{\aurc}{0.120}
\newcommand*{\ece}{0.169}
\newcommand*{\precAtDefault}{0.899}   % threshold 0.84
\newcommand*{\covAtDefault}{0.669}
\newcommand*{\threshNinetyFive}{0.91}
\newcommand*{\covNinetyFive}{0.206}
% PII transfer validation (OpenMed-PII 44M on bare cells).
% These macros expand to formatted text (percent sign / wording included),
% not to bare numbers.
\newcommand*{\piiNameBare}{100\%}
\newcommand*{\piiAddrBare}{100\%}
\newcommand*{\piiNegRate}{43\%}
\newcommand*{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded
% NOTE(review): these macros have no section comment of their own. Judging by
% their names they extend the wide-suite OR-Fp / injected-error and hospital v6
% model groups rather than the PII block they directly follow --- confirm.
% \injFGrounded holds the same value as \northGrounded (0.224).
\newcommand*{\realFORFp}{0.039}
\newcommand*{\injFGrounded}{0.224}
\newcommand*{\injFORFp}{0.282}
\newcommand*{\damageORFp}{0.001}
\newcommand*{\hospModelRecallVSix}{0.475}
\newcommand*{\hospModelPrecVSix}{0.185}
% WS1 --- plan-level selective prediction (verified union planner).
% repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
% NOTE(review): 210/232 = 0.905 and 210/509 = 0.413 (to three decimals), i.e. the
% union precision/coverage look like fixed/changed and fixed/errors; likewise
% 146/509 = 0.287 for the model-only coverage --- confirm these definitions.
\newcommand*{\unionGatePrec}{0.905}
\newcommand*{\unionGateCov}{0.413}
\newcommand*{\modelGatePrec}{0.993} % gated model plan alone, tau=0.5 (146/147 correct)
\newcommand*{\modelGateCov}{0.287}
\newcommand*{\unionChanged}{232}
\newcommand*{\unionFixed}{210}
\newcommand*{\hospErrors}{509}
% 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5
% (eval/results/union_gate_3seed.json).
% The bodies use \ensuremath rather than literal $...$ so that each macro
% typesets the same in running text and can also be placed inside an existing
% math context (e.g. $\unionGateThreeSeedPrec$) without closing that formula.
\newcommand*{\unionGateThreeSeedPrec}{\ensuremath{0.891 \pm 0.012}}
\newcommand*{\unionGateThreeSeedCov}{\ensuremath{0.396 \pm 0.025}}
% WS2 pair-profiles (measured-and-cut):
% constrained raw plan (\pairsRaw*) / composed at tau=0.5 (\pairsUnion*).
\newcommand*{\pairsRawPrec}{0.760}
\newcommand*{\pairsRawCov}{0.348}
\newcommand*{\pairsUnionPrec}{0.876}
\newcommand*{\pairsUnionCov}{0.387}
% ===== v2 (post-freeze system, 2026-06-11/12) =====
% \ttFOne expands to a range written with a text en-dash (0.955--0.957),
% so it is a text-mode macro.
\newcommand*{\nPairs}{42}
\newcommand*{\nWild}{35}
\newcommand*{\nTrust}{239}
\newcommand*{\unseenMacroF}{0.363}
\newcommand*{\unseenMacroDamage}{0.0219}
\newcommand*{\wildRecovery}{0.207}
\newcommand*{\genFTwo}{0.058}
\newcommand*{\genVRTwo}{0.108}
\newcommand*{\genDamageTwo}{0.036}
\newcommand*{\ttFOne}{0.955--0.957}
\newcommand*{\flightsVoteF}{0.164}
\newcommand*{\flightsBaseF}{0.044}
\newcommand*{\hospVoteHeur}{0.186}
\newcommand*{\hospBaseHeur}{0.092}
\newcommand*{\gidclHosp}{0.97}
% WS4 --- learned-repair baselines, Raha real slice only (eval/baselines_learned.py).
\newcommand*{\realFBaran}{0.811}   % oracle detection + 20 gold labels: upper bound
\newcommand*{\realFBaranCI}{0.018} % 3 label-sampling seeds
\newcommand*{\damageBaran}{0.003}
\newcommand*{\precBaran}{0.824}
\newcommand*{\realFJelly}{0.074}   % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py)
\newcommand*{\damageJelly}{0.027}
% W1.a --- matched-budget label curve, 5-dataset Raha macro
% (eval/results/label_curve.json). k is the label budget named in each comment.
% \realFOursHead holds the same value as \realFGrounded (0.225).
\newcommand*{\realFBaranZero}{0.000}  % Baran k=0 (oracle positions retained), 3 seeds
\newcommand*{\realFBaranFive}{0.504}  % Baran k=5
\newcommand*{\realFOursFive}{0.282}   % ours k=5 (labels validate/expand accept set only)
\newcommand*{\realFOursTwenty}{0.351} % ours k=20
\newcommand*{\realFOursHead}{0.225}   % ours k=0 at HEAD (post-freeze capabilities)
% W4.3/4.4 --- degenerate baselines + cost-weighted scores
% (eval/results/degenerate.json).
\newcommand*{\degShippedF}{0.343}
\newcommand*{\degShippedP}{0.576}
\newcommand*{\degShippedDamage}{0.023}
% Signed value: \ensuremath (instead of literal $...$) keeps the text-mode
% rendering and also lets the macro sit inside an existing math context
% without closing that formula.
\newcommand*{\degShippedPhiOne}{\ensuremath{+0.13}}
% W1.c --- zero-shot capability scaling arm (eval/results/scaling_arm.json).
\newcommand*{\scalePrecBig}{0.915} % devstral-24B and gemma4-31B union point
\newcommand*{\scaleCovBig}{0.485}
\newcommand*{\scalePrecNemo}{0.877}
\newcommand*{\scaleCovNemo}{0.336}
% Hosted wall-clock for the 5 planning calls, single capture
% (taken from the runtime_s field of scaling_arm.json).
\newcommand*{\runtimeDevstral}{135}
\newcommand*{\runtimeNemo}{114}
\newcommand*{\runtimeGemma}{104}
% Frontier zero-shot reference point.
% Hospital repair recall of a vanilla frontier-scale cloud planner, run through the
% same propose/execute harness (architecture validation captures of 2026-06-04..07,
% pre-verifier; recorded in the project training-run logs). The fine-tune results
% subsection quotes it as the zero-shot ceiling that the v6 recall approaches.
\newcommand*{\frontierZeroShotRecall}{0.51}
% R3 --- absolute champion GEN-F1 basis of the equivalence retrain series.
% Source field: eval/results/equivalence.json, spec.champion_macro_gen_f1 = 0.014606
% (stored here to four decimals).
\newcommand*{\genChampionBasis}{0.0146}