Spaces:
Running
Running
| % Result macros --- every value regenerates from one command (see Reproducibility section). | |
| % Headline fine-tune (synthetic frozen gold, Layer 1). NOTE: \canonFMultiSeed carries its own $...$ plus a text qualifier, so it is only usable in text mode. | |
| \newcommand{\canonFOurs}{0.815} % v5 bf16, n=20 (single run; multi-seed figure is \canonFMultiSeed below) | |
| \newcommand{\canonFOursBest}{0.901} % v4 Q8 measurement | |
| \newcommand{\canonFBig}{0.452} % large generic model (GLM-class, zero-shot) | |
| \newcommand{\canonFHeur}{0.152} % rule heuristic | |
| \newcommand{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)} | |
| \newcommand{\opFOurs}{0.957} | |
| \newcommand{\jsonValidOurs}{0.950} | |
| % Hospital head-to-head, repairs-only churn-neutral (both paths incl. errors-are-rare gates) | |
| \newcommand{\hospRecallGrounded}{0.257} | |
| \newcommand{\hospRecallFreq}{0.293} | |
| \newcommand{\hospPrecGrounded}{0.845} | |
| \newcommand{\hospPrecFreq}{0.871} | |
| \newcommand{\hospModelRecall}{0.424} % fine-tuned v5, repair_recall (vs 0.000 synthetic-only) | |
| % Wide-suite comparison (3 seeds, churn-neutral metric) --- money table. | |
| % PRIMARY = HEAD regeneration 2026-06-12 (eval/results/money_table_head.json, | |
| % post-capability system, NaN metric fix included). Freeze (2026-06-10) values are kept | |
| % as the *Freeze macros (\northGroundedFreeze, \realFGroundedFreeze, \damageGroundedFreeze) where the narrative discusses the freeze-version system. | |
| \newcommand{\northGrounded}{0.224} | |
| \newcommand{\northGroundedCI}{0.004} | |
| \newcommand{\northORFp}{0.211} | |
| \newcommand{\northORKnn}{0.122} | |
| \newcommand{\realFGrounded}{0.225} | |
| \newcommand{\realFORKnn}{0.058} | |
| \newcommand{\damageGrounded}{0.092} | |
| \newcommand{\damageORKnn}{0.096} | |
| \newcommand{\northGroundedFreeze}{0.203} | |
| \newcommand{\realFGroundedFreeze}{0.174} | |
| \newcommand{\damageGroundedFreeze}{0.104} | |
| % SHIPPED system (verified union, v6 adapter) on suite --- scripts/modal_eval_suite.py | |
| \newcommand{\modelRealF}{0.142} | |
| \newcommand{\modelDamage}{0.015} | |
| \newcommand{\modelAbstain}{1.000} | |
| % Ablations (churn-neutral metric, 3 seeds --- eval/results/ablations.json) | |
| \newcommand{\ablFull}{0.203} | |
| \newcommand{\ablNoGround}{0.223} | |
| \newcommand{\ablNoAbstain}{0.190} | |
| \newcommand{\ablNoMargin}{0.197} | |
| \newcommand{\ablNoCase}{0.201} | |
| \newcommand{\ablFullRealF}{0.174} | |
| \newcommand{\ablNoGroundRealF}{0.135} | |
| \newcommand{\ablNoAbstainDamage}{0.108} | |
| \newcommand{\ablFullDamage}{0.104} | |
| \newcommand{\ablFullAbstain}{1.000} | |
| \newcommand{\ablNoAbstainAbstain}{0.250} | |
| % Selective prediction / calibration | |
| \newcommand{\aurc}{0.120} | |
| \newcommand{\ece}{0.169} | |
| \newcommand{\precAtDefault}{0.899} % threshold 0.84 | |
| \newcommand{\covAtDefault}{0.669} | |
| \newcommand{\threshNinetyFive}{0.91} | |
| \newcommand{\covNinetyFive}{0.206} | |
| % PII transfer validation (OpenMed-PII 44M on bare cells) | |
| \newcommand{\piiNameBare}{100\%} | |
| \newcommand{\piiAddrBare}{100\%} | |
| \newcommand{\piiNegRate}{43\%} | |
| \newcommand{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded. NOTE(review): the six macros below (\realFORFp through \hospModelPrecVSix) have no group header of their own; by name they look like wide-suite / hospital figures rather than PII results --- confirm and regroup. | |
| \newcommand{\realFORFp}{0.039} | |
| \newcommand{\injFGrounded}{0.224} | |
| \newcommand{\injFORFp}{0.282} | |
| \newcommand{\damageORFp}{0.001} | |
| \newcommand{\hospModelRecallVSix}{0.475} | |
| \newcommand{\hospModelPrecVSix}{0.185} | |
| % WS1 --- plan-level selective prediction (verified union planner) | |
| % repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union | |
| \newcommand{\unionGatePrec}{0.905} | |
| \newcommand{\unionGateCov}{0.413} | |
| \newcommand{\modelGatePrec}{0.993} % gated model plan alone, tau=0.5 (146/147 correct) | |
| \newcommand{\modelGateCov}{0.287} | |
| \newcommand{\unionChanged}{232} | |
| \newcommand{\unionFixed}{210} | |
| \newcommand{\hospErrors}{509} | |
| % 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5 (eval/results/union_gate_3seed.json). The two +/- macros use \ensuremath instead of literal $...$ so they expand correctly both in running text and inside an existing math context; text-mode output is unchanged. | |
| \newcommand{\unionGateThreeSeedPrec}{\ensuremath{0.891 \pm 0.012}} | |
| \newcommand{\unionGateThreeSeedCov}{\ensuremath{0.396 \pm 0.025}} | |
| % WS2 pair-profiles (measured-and-cut): constrained raw plan / composed at tau=0.5 | |
| \newcommand{\pairsRawPrec}{0.760} | |
| \newcommand{\pairsRawCov}{0.348} | |
| \newcommand{\pairsUnionPrec}{0.876} | |
| \newcommand{\pairsUnionCov}{0.387} | |
| % ===== v2 (post-freeze system, 2026-06-11/12) ===== | |
| \newcommand{\nPairs}{42} | |
| \newcommand{\nWild}{35} | |
| \newcommand{\nTrust}{239} | |
| \newcommand{\unseenMacroF}{0.363} | |
| \newcommand{\unseenMacroDamage}{0.0219} | |
| \newcommand{\wildRecovery}{0.207} | |
| \newcommand{\genFTwo}{0.058} | |
| \newcommand{\genVRTwo}{0.108} | |
| \newcommand{\genDamageTwo}{0.036} | |
| \newcommand{\ttFOne}{0.955--0.957} | |
| \newcommand{\flightsVoteF}{0.164} | |
| \newcommand{\flightsBaseF}{0.044} | |
| \newcommand{\hospVoteHeur}{0.186} | |
| \newcommand{\hospBaseHeur}{0.092} | |
| \newcommand{\gidclHosp}{0.97} | |
| % WS4 --- learned-repair baselines, Raha real slice only (eval/baselines_learned.py) | |
| \newcommand{\realFBaran}{0.811} % oracle detection + 20 gold labels: upper bound | |
| \newcommand{\realFBaranCI}{0.018} % 3 label-sampling seeds | |
| \newcommand{\damageBaran}{0.003} | |
| \newcommand{\precBaran}{0.824} | |
| \newcommand{\realFJelly}{0.074} % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py) | |
| \newcommand{\damageJelly}{0.027} | |
| % W1.a --- matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json) | |
| \newcommand{\realFBaranZero}{0.000} % Baran k=0 (oracle positions retained), 3 seeds | |
| \newcommand{\realFBaranFive}{0.504} % Baran k=5 | |
| \newcommand{\realFOursFive}{0.282} % ours k=5 (labels validate/expand accept set only) | |
| \newcommand{\realFOursTwenty}{0.351} % ours k=20 | |
| \newcommand{\realFOursHead}{0.225} % ours k=0 at HEAD (post-freeze capabilities) | |
| % W4.3/4.4 --- degenerate baselines + cost-weighted scores (eval/results/degenerate.json). \degShippedPhiOne uses \ensuremath instead of literal $...$ so the signed value expands correctly both in running text and inside an existing math context; text-mode output is unchanged. | |
| \newcommand{\degShippedF}{0.343} | |
| \newcommand{\degShippedP}{0.576} | |
| \newcommand{\degShippedDamage}{0.023} | |
| \newcommand{\degShippedPhiOne}{\ensuremath{+0.13}} | |
| % W1.c --- zero-shot capability scaling arm (eval/results/scaling_arm.json) | |
| \newcommand{\scalePrecBig}{0.915} % devstral-24B and gemma4-31B union point | |
| \newcommand{\scaleCovBig}{0.485} | |
| \newcommand{\scalePrecNemo}{0.877} | |
| \newcommand{\scaleCovNemo}{0.336} | |
| % hosted wall-clock for the 5 planning calls, single capture (scaling_arm.json runtime_s) | |
| \newcommand{\runtimeDevstral}{135} | |
| \newcommand{\runtimeNemo}{114} | |
| \newcommand{\runtimeGemma}{104} | |
| % Frontier zero-shot reference point: hospital repair recall of a vanilla frontier-scale | |
| % cloud planner run through the same propose/execute harness (2026-06-04..07 architecture | |
| % validation captures, pre-verifier; recorded in project training-run logs). Quoted in the | |
| % fine-tune results subsection as the zero-shot ceiling the v6 recall approaches. | |
| \newcommand{\frontierZeroShotRecall}{0.51} | |
| % R3 --- absolute champion GEN-F1 basis of the equivalence retrain series | |
| % (eval/results/equivalence.json spec.champion_macro_gen_f1 = 0.014606, rounded to four decimals below) | |
| \newcommand{\genChampionBasis}{0.0146} | |