% Result macros — every value regenerates from one command (see Reproducibility section).
% Layout: one definition per line. A `%' comments out the remainder of its
% physical line, so a definition must never share a line with a preceding comment.
% \newcommand* is used because these are argument-less constants; like
% \newcommand it raises an error on accidental redefinition.

% Headline fine-tune (synthetic frozen gold, Layer 1)
\newcommand*{\canonFOurs}{0.815} % v5 bf16, n=20 (single run; multi-seed below)
\newcommand*{\canonFOursBest}{0.901} % v4 Q8 measurement
\newcommand*{\canonFBig}{0.452} % large generic model (GLM-class, zero-shot)
\newcommand*{\canonFHeur}{0.152} % rule heuristic
% Carries its own math delimiters and trailing text: use in text mode only.
\newcommand*{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
\newcommand*{\opFOurs}{0.957}
\newcommand*{\jsonValidOurs}{0.950}

% Hospital head-to-head, repairs-only churn-neutral (both paths incl. errors-are-rare gates)
\newcommand*{\hospRecallGrounded}{0.257}
\newcommand*{\hospRecallFreq}{0.293}
\newcommand*{\hospPrecGrounded}{0.845}
\newcommand*{\hospPrecFreq}{0.871}
\newcommand*{\hospModelRecall}{0.424} % fine-tuned v5, repair_recall (vs 0.000 synthetic-only)

% Wide-suite comparison (3 seeds, churn-neutral metric) — money table.
% PRIMARY = HEAD regeneration 2026-06-12 (eval/results/money_table_head.json,
% post-capability system, NaN metric fix in). Freeze (2026-06-10) values kept
% as *Freeze macros where the narrative discusses the freeze-version system.
% One definition per line: a `%' comments out the rest of its physical line,
% so definitions must not follow a comment on the same line.
% HEAD values first, then the matching *Freeze values.
\newcommand*{\northGrounded}{0.224}
\newcommand*{\northGroundedCI}{0.004}
\newcommand*{\northORFp}{0.211}
\newcommand*{\northORKnn}{0.122}
\newcommand*{\realFGrounded}{0.225}
\newcommand*{\realFORKnn}{0.058}
\newcommand*{\damageGrounded}{0.092}
\newcommand*{\damageORKnn}{0.096}
\newcommand*{\northGroundedFreeze}{0.203}
\newcommand*{\realFGroundedFreeze}{0.174}
\newcommand*{\damageGroundedFreeze}{0.104}

% SHIPPED system (verified union, v6 adapter) on suite — scripts/modal_eval_suite.py
\newcommand*{\modelRealF}{0.142}
\newcommand*{\modelDamage}{0.015}
\newcommand*{\modelAbstain}{1.000}

% Ablations (churn-neutral metric, 3 seeds — eval/results/ablations.json)
\newcommand*{\ablFull}{0.203}
\newcommand*{\ablNoGround}{0.223}
\newcommand*{\ablNoAbstain}{0.190}
\newcommand*{\ablNoMargin}{0.197}
\newcommand*{\ablNoCase}{0.201}
\newcommand*{\ablFullRealF}{0.174}
\newcommand*{\ablNoGroundRealF}{0.135}
\newcommand*{\ablNoAbstainDamage}{0.108}
\newcommand*{\ablFullDamage}{0.104}
\newcommand*{\ablFullAbstain}{1.000}
\newcommand*{\ablNoAbstainAbstain}{0.250}

% Selective prediction / calibration
\newcommand*{\aurc}{0.120}
\newcommand*{\ece}{0.169}
\newcommand*{\precAtDefault}{0.899} % threshold 0.84
\newcommand*{\covAtDefault}{0.669}
\newcommand*{\threshNinetyFive}{0.91}
\newcommand*{\covNinetyFive}{0.206}

% PII transfer validation (OpenMed-PII 44M on bare cells)
\newcommand*{\piiNameBare}{100\%}
\newcommand*{\piiAddrBare}{100\%}
\newcommand*{\piiNegRate}{43\%}
\newcommand*{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded

% NOTE(review): the six macros below have no heading of their own in the
% original; by name they look like wide-suite / hospital v6 values rather
% than PII results — confirm the grouping and add the source file.
\newcommand*{\realFORFp}{0.039}
\newcommand*{\injFGrounded}{0.224}
\newcommand*{\injFORFp}{0.282}
\newcommand*{\damageORFp}{0.001}
\newcommand*{\hospModelRecallVSix}{0.475}
\newcommand*{\hospModelPrecVSix}{0.185}

% WS1 — plan-level selective prediction (verified union planner)
% repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
\newcommand*{\unionGatePrec}{0.905}
\newcommand*{\unionGateCov}{0.413}
\newcommand*{\modelGatePrec}{0.993} % gated model
% (\modelGatePrec note, continued) plan alone, tau=0.5 (146/147 correct)
% The line above is the tail of a comment; it must stay behind a `%' or it
% is typeset as body text. One definition per line throughout, because a
% `%' comments out the rest of its physical line.
\newcommand*{\modelGateCov}{0.287}
\newcommand*{\unionChanged}{232}
\newcommand*{\unionFixed}{210}
\newcommand*{\hospErrors}{509}

% 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5 (eval/results/union_gate_3seed.json)
% These two carry their own math delimiters: use in text mode only.
\newcommand*{\unionGateThreeSeedPrec}{$0.891 \pm 0.012$}
\newcommand*{\unionGateThreeSeedCov}{$0.396 \pm 0.025$}

% WS2 pair-profiles (measured-and-cut): constrained raw plan / composed at tau=0.5
\newcommand*{\pairsRawPrec}{0.760}
\newcommand*{\pairsRawCov}{0.348}
\newcommand*{\pairsUnionPrec}{0.876}
\newcommand*{\pairsUnionCov}{0.387}

% ===== v2 (post-freeze system, 2026-06-11/12) =====
\newcommand*{\nPairs}{42}
\newcommand*{\nWild}{35}
\newcommand*{\nTrust}{239}
\newcommand*{\unseenMacroF}{0.363}
\newcommand*{\unseenMacroDamage}{0.0219}
\newcommand*{\wildRecovery}{0.207}
\newcommand*{\genFTwo}{0.058}
\newcommand*{\genVRTwo}{0.108}
\newcommand*{\genDamageTwo}{0.036}
\newcommand*{\ttFOne}{0.955--0.957} % text-mode en-dash range; `--' would be two minus signs in math mode
\newcommand*{\flightsVoteF}{0.164}
\newcommand*{\flightsBaseF}{0.044}
\newcommand*{\hospVoteHeur}{0.186}
\newcommand*{\hospBaseHeur}{0.092}
\newcommand*{\gidclHosp}{0.97}

% WS4 — learned-repair baselines, Raha real slice only (eval/baselines_learned.py)
\newcommand*{\realFBaran}{0.811} % oracle detection + 20 gold labels: upper bound
\newcommand*{\realFBaranCI}{0.018} % 3 label-sampling seeds
\newcommand*{\damageBaran}{0.003}
\newcommand*{\precBaran}{0.824}
\newcommand*{\realFJelly}{0.074} % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py)
\newcommand*{\damageJelly}{0.027}

% W1.a — matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json)
\newcommand*{\realFBaranZero}{0.000} % Baran k=0 (oracle positions retained), 3 seeds
\newcommand*{\realFBaranFive}{0.504} % Baran k=5
\newcommand*{\realFOursFive}{0.282} % ours k=5 (labels validate/expand accept set only)
\newcommand*{\realFOursTwenty}{0.351} % ours k=20
\newcommand*{\realFOursHead}{0.225} % ours k=0 at HEAD (post-freeze capabilities)

% W4.3/4.4 — degenerate baselines
% (W4.3/4.4 heading, continued) + cost-weighted scores (eval/results/degenerate.json)
% The line above is the tail of a comment; it must stay behind a `%' or it
% is typeset as body text. One definition per line throughout, because a
% `%' comments out the rest of its physical line.
\newcommand*{\degShippedF}{0.343}
\newcommand*{\degShippedP}{0.576}
\newcommand*{\degShippedDamage}{0.023}
\newcommand*{\degShippedPhiOne}{$+0.13$} % carries its own math delimiters: text mode only

% W1.c — zero-shot capability scaling arm (eval/results/scaling_arm.json)
\newcommand*{\scalePrecBig}{0.915} % devstral-24B and gemma4-31B union point
\newcommand*{\scaleCovBig}{0.485}
\newcommand*{\scalePrecNemo}{0.877}
\newcommand*{\scaleCovNemo}{0.336}

% hosted wall-clock for the 5 planning calls, single capture (scaling_arm.json runtime_s)
\newcommand*{\runtimeDevstral}{135}
\newcommand*{\runtimeNemo}{114}
\newcommand*{\runtimeGemma}{104}

% Frontier zero-shot reference point: hospital repair recall of a vanilla frontier-scale
% cloud planner run through the same propose/execute harness (2026-06-04..07 architecture
% validation captures, pre-verifier; recorded in project training-run logs). Quoted in the
% fine-tune results subsection as the zero-shot ceiling the v6 recall approaches.
\newcommand*{\frontierZeroShotRecall}{0.51}

% R3 — absolute champion GEN-F1 basis of the equivalence retrain series
% (eval/results/equivalence.json spec.champion_macro_gen_f1 = 0.014606)
\newcommand*{\genChampionBasis}{0.0146}