Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 18 days ago

6.5 kB

	% Result macros: every value is regenerated by a single command (see the Reproducibility section).
	% --- Headline fine-tune: synthetic frozen gold, Layer 1 ---
	\newcommand*{\canonFOurs}{0.815}     % v5, bf16, n=20; single run (the multi-seed figure is \canonFMultiSeed)
	\newcommand*{\canonFOursBest}{0.901} % measured on v4, Q8
	\newcommand*{\canonFBig}{0.452}      % zero-shot large generic model (GLM-class)
	\newcommand*{\canonFHeur}{0.152}     % rule heuristic
	% Expands to inline math followed by prose, so it is meant for text mode only.
	\newcommand*{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
	\newcommand*{\opFOurs}{0.957}
	\newcommand*{\jsonValidOurs}{0.950}

	% --- Hospital head-to-head ---
	% Scored repairs-only and churn-neutral; both paths include the errors-are-rare gates.
	% Grounded path:
	\newcommand*{\hospRecallGrounded}{0.257}
	\newcommand*{\hospPrecGrounded}{0.845}
	% Frequency path:
	\newcommand*{\hospRecallFreq}{0.293}
	\newcommand*{\hospPrecFreq}{0.871}
	% Fine-tuned v5 model, repair_recall (the synthetic-only variant scored 0.000).
	\newcommand*{\hospModelRecall}{0.424}

	% --- Wide-suite comparison ("money table"): 3 seeds, churn-neutral metric ---
	% The primary numbers are the HEAD regeneration of 2026-06-12
	% (eval/results/money_table_head.json; post-capability system, NaN metric fix included).
	% The 2026-06-10 freeze values are retained under *Freeze names for the places
	% where the narrative discusses the freeze-version system.
	%
	% HEAD values:
	\newcommand*{\northGrounded}{0.224}
	\newcommand*{\northGroundedCI}{0.004}
	\newcommand*{\northORFp}{0.211}
	\newcommand*{\northORKnn}{0.122}
	\newcommand*{\realFGrounded}{0.225}
	\newcommand*{\realFORKnn}{0.058}
	\newcommand*{\damageGrounded}{0.092}
	\newcommand*{\damageORKnn}{0.096}
	% Freeze values:
	\newcommand*{\northGroundedFreeze}{0.203}
	\newcommand*{\realFGroundedFreeze}{0.174}
	\newcommand*{\damageGroundedFreeze}{0.104}

	% --- Shipped system (verified union, v6 adapter) on the suite ---
	% Script: scripts/modal_eval_suite.py
	\newcommand*{\modelRealF}{0.142}
	\newcommand*{\modelDamage}{0.015}
	\newcommand*{\modelAbstain}{1.000}

	% --- Ablations: churn-neutral metric, 3 seeds (eval/results/ablations.json) ---
	% Headline score per configuration:
	\newcommand*{\ablFull}{0.203}
	\newcommand*{\ablNoGround}{0.223}
	\newcommand*{\ablNoAbstain}{0.190}
	\newcommand*{\ablNoMargin}{0.197}
	\newcommand*{\ablNoCase}{0.201}
	% Per-configuration RealF / Damage / Abstain breakdowns:
	\newcommand*{\ablFullRealF}{0.174}
	\newcommand*{\ablNoGroundRealF}{0.135}
	\newcommand*{\ablNoAbstainDamage}{0.108}
	\newcommand*{\ablFullDamage}{0.104}
	\newcommand*{\ablFullAbstain}{1.000}
	\newcommand*{\ablNoAbstainAbstain}{0.250}

	% --- Selective prediction / calibration ---
	\newcommand*{\aurc}{0.120}
	\newcommand*{\ece}{0.169}
	% Operating point at threshold 0.84:
	\newcommand*{\precAtDefault}{0.899}
	\newcommand*{\covAtDefault}{0.669}
	\newcommand*{\threshNinetyFive}{0.91}
	\newcommand*{\covNinetyFive}{0.206}

	% --- PII transfer validation: OpenMed-PII 44M on bare cells ---
	% These expand to text that already carries its own unit (an escaped percent sign or words).
	\newcommand*{\piiNameBare}{100\%}
	\newcommand*{\piiAddrBare}{100\%}
	\newcommand*{\piiNegRate}{43\%}
	% Seeded run of eval/pii_leak.py.
	\newcommand*{\piiLeakRate}{zero (0/360)}
	% NOTE(review): the macros below carry no section comment of their own and do not look
	% PII-related. Their names parallel the wide-suite macros (\northORFp, \realFORKnn,
	% \damageORKnn) and the hospital model macro (\hospModelRecall) -- presumably they
	% belong with those groups; confirm provenance before relying on this grouping.
	\newcommand*{\realFORFp}{0.039}
	\newcommand*{\injFGrounded}{0.224}
	\newcommand*{\injFORFp}{0.282}
	\newcommand*{\damageORFp}{0.001}
	\newcommand*{\hospModelRecallVSix}{0.475}
	\newcommand*{\hospModelPrecVSix}{0.185}

	% --- WS1: plan-level selective prediction (verified union planner) ---
	% Reproduce with:
	%   uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
	\newcommand*{\unionGatePrec}{0.905}
	\newcommand*{\unionGateCov}{0.413}
	% Gated model plan on its own at tau=0.5 (146 of 147 correct).
	\newcommand*{\modelGatePrec}{0.993}
	\newcommand*{\modelGateCov}{0.287}
	% Raw counts.
	% NOTE(review): 210/232 = 0.905 and 210/509 = 0.413 match \unionGatePrec and
	% \unionGateCov, so these are presumably the numerators/denominators -- confirm.
	\newcommand*{\unionChanged}{232}
	\newcommand*{\unionFixed}{210}
	\newcommand*{\hospErrors}{509}
	% Union gate at tau=0.5 over 3 training seeds (mixA 21 = shipped, 25, 26);
	% source: eval/results/union_gate_3seed.json.
	% \ensuremath replaces the literal $...$ delimiters so the macros work in both text
	% and math mode. With hard-coded delimiters, $\unionGateThreeSeedPrec$ closes math
	% early and \pm is then hit in text mode. Text-mode output is unchanged.
	\newcommand*{\unionGateThreeSeedPrec}{\ensuremath{0.891 \pm 0.012}}
	\newcommand*{\unionGateThreeSeedCov}{\ensuremath{0.396 \pm 0.025}}
	% --- WS2: pair-profiles (measured-and-cut) ---
	% Constrained raw plan:
	\newcommand*{\pairsRawPrec}{0.760}
	\newcommand*{\pairsRawCov}{0.348}
	% Composed at tau=0.5:
	\newcommand*{\pairsUnionPrec}{0.876}
	\newcommand*{\pairsUnionCov}{0.387}

	% ===== v2: post-freeze system (2026-06-11/12) =====
	% Counts:
	\newcommand*{\nPairs}{42}
	\newcommand*{\nWild}{35}
	\newcommand*{\nTrust}{239}
	% Scores:
	\newcommand*{\unseenMacroF}{0.363}
	\newcommand*{\unseenMacroDamage}{0.0219}
	\newcommand*{\wildRecovery}{0.207}
	\newcommand*{\genFTwo}{0.058}
	\newcommand*{\genVRTwo}{0.108}
	\newcommand*{\genDamageTwo}{0.036}
	\newcommand*{\ttFOne}{0.955--0.957} % a range, typeset with an en-dash
	\newcommand*{\flightsVoteF}{0.164}
	\newcommand*{\flightsBaseF}{0.044}
	\newcommand*{\hospVoteHeur}{0.186}
	\newcommand*{\hospBaseHeur}{0.092}
	\newcommand*{\gidclHosp}{0.97}

	% --- WS4: learned-repair baselines, Raha real slice only (eval/baselines_learned.py) ---
	% Baran, given oracle detection plus 20 gold labels; treated as an upper bound.
	\newcommand*{\realFBaran}{0.811}
	\newcommand*{\realFBaranCI}{0.018} % over 3 label-sampling seeds
	\newcommand*{\damageBaran}{0.003}
	\newcommand*{\precBaran}{0.824}
	% Jellyfish-13B, ED+DI (scripts/modal_jellyfish.py).
	\newcommand*{\realFJelly}{0.074}
	\newcommand*{\damageJelly}{0.027}

	% --- W1.a: matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json) ---
	% Baran at k labels:
	\newcommand*{\realFBaranZero}{0.000} % k=0, oracle positions retained, 3 seeds
	\newcommand*{\realFBaranFive}{0.504} % k=5
	% Ours at k labels (labels only validate/expand the accept set):
	\newcommand*{\realFOursFive}{0.282}   % k=5
	\newcommand*{\realFOursTwenty}{0.351} % k=20
	\newcommand*{\realFOursHead}{0.225}   % k=0 at HEAD (post-freeze capabilities)

	% --- W4.3/4.4: degenerate baselines + cost-weighted scores (eval/results/degenerate.json) ---
	\newcommand*{\degShippedF}{0.343}
	\newcommand*{\degShippedP}{0.576}
	\newcommand*{\degShippedDamage}{0.023}
	% \ensuremath replaces the literal $...$ delimiters so the macro works in both text
	% and math mode. With hard-coded delimiters, $\degShippedPhiOne$ expands to
	% $$+0.13$$, which TeX does not read as the signed value inside inline math.
	% Text-mode output is unchanged.
	\newcommand*{\degShippedPhiOne}{\ensuremath{+0.13}}

	% --- W1.c: zero-shot capability scaling arm (eval/results/scaling_arm.json) ---
	% Union point for devstral-24B and gemma4-31B:
	\newcommand*{\scalePrecBig}{0.915}
	\newcommand*{\scaleCovBig}{0.485}
	\newcommand*{\scalePrecNemo}{0.877}
	\newcommand*{\scaleCovNemo}{0.336}
	% Hosted wall-clock for the 5 planning calls, from a single capture
	% (scaling_arm.json key runtime_s -- presumably seconds; confirm).
	\newcommand*{\runtimeDevstral}{135}
	\newcommand*{\runtimeNemo}{114}
	\newcommand*{\runtimeGemma}{104}

	% --- Frontier zero-shot reference point ---
	% Hospital repair recall of a vanilla frontier-scale cloud planner, run through the same
	% propose/execute harness. Taken from the 2026-06-04..07 architecture validation captures
	% (pre-verifier), as recorded in the project training-run logs. The fine-tune results
	% subsection quotes it as the zero-shot ceiling that the v6 recall approaches.
	\newcommand*{\frontierZeroShotRecall}{0.51}

	% --- R3: absolute champion GEN-F1 basis of the equivalence retrain series ---
	% The value is the 4-decimal rounding of spec.champion_macro_gen_f1 = 0.014606
	% in eval/results/equivalence.json.
	\newcommand*{\genChampionBasis}{0.0146}