{ "stage": "stage0_7_posthoc_verifier_ft028", "base_stage": "stage0_7_support_aggregation", "verifier_kind": "hgb", "target_ft": 0.028, "max_accuracy_drop": 0.015, "seeds": [ 42, 1337, 7 ], "runs": [ { "seed": 42, "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed42_ft028\\verifier_report.json", "selected_threshold": 0.73, "selection_reason": "target_ft_and_accuracy_floor", "test_accuracy_baseline": 0.9003660024400163, "test_false_trustworthy_baseline": 0.032582938388625596, "test_trustworthy_recall_baseline": 0.8326848249027238, "test_accuracy_guarded": 0.9007726718178121, "test_false_trustworthy_guarded": 0.021919431279620854, "test_trustworthy_recall_guarded": 0.814526588845655, "test_rejected_candidate_trustworthy": 32, "eval_accuracy_guarded": 0.8926392842618951, "eval_false_trustworthy_guarded": 0.027630805408583186 }, { "seed": 1337, "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed1337_ft028\\verifier_report.json", "selected_threshold": 0.62, "selection_reason": "target_ft_and_accuracy_floor", "test_accuracy_baseline": 0.8918259455063033, "test_false_trustworthy_baseline": 0.0254739336492891, "test_trustworthy_recall_baseline": 0.8184176394293126, "test_accuracy_guarded": 0.89019926799512, "test_false_trustworthy_guarded": 0.022511848341232227, "test_trustworthy_recall_guarded": 0.808041504539559, "test_rejected_candidate_trustworthy": 13, "eval_accuracy_guarded": 0.8938592923952826, "eval_false_trustworthy_guarded": 0.027630805408583186 }, { "seed": 7, "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed7_ft028\\verifier_report.json", "selected_threshold": 0.67, "selection_reason": "target_ft_and_accuracy_floor", "test_accuracy_baseline": 0.8889792598617324, "test_false_trustworthy_baseline": 0.030213270142180094, "test_trustworthy_recall_baseline": 0.7937743190661478, "test_accuracy_guarded": 0.8877592517283448, "test_false_trustworthy_guarded": 0.02665876777251185, "test_trustworthy_recall_guarded": 0.7833981841763943, "test_rejected_candidate_trustworthy": 14, "eval_accuracy_guarded": 0.8934526230174867, "eval_false_trustworthy_guarded": 0.027630805408583186 } ], "mean_std": { "selected_threshold": { "mean": 0.6733333333333333, "std": 0.05507570547286101, "min": 0.62, "max": 0.73 }, "test_accuracy_baseline": { "mean": 0.8937237359360174, "std": 0.0059258487174717244, "min": 0.8889792598617324, "max": 0.9003660024400163 }, "test_false_trustworthy_baseline": { "mean": 0.029423380726698263, "std": 0.0036197280370899227, "min": 0.0254739336492891, "max": 0.032582938388625596 }, "test_trustworthy_recall_baseline": { "mean": 0.8149589277993947, "std": 0.019684483247440092, "min": 0.7937743190661478, "max": 0.8326848249027238 }, "test_accuracy_guarded": { "mean": 0.8929103971804256, "std": 0.00691736522823536, "min": 0.8877592517283448, "max": 0.9007726718178121 }, "test_false_trustworthy_guarded": { "mean": 0.023696682464454978, "std": 0.0025822861039932906, "min": 0.021919431279620854, "max": 0.02665876777251185 }, "test_trustworthy_recall_guarded": { "mean": 0.8019887591872028, "std": 0.016423190586444085, "min": 0.7833981841763943, "max": 0.814526588845655 }, "test_rejected_candidate_trustworthy": { "mean": 19.666666666666668, "std": 10.692676621563626, "min": 13, "max": 32 }, "eval_accuracy_guarded": { "mean": 0.8933170665582215, "std": 0.0006211977355233384, "min": 0.8926392842618951, "max": 0.8938592923952826 }, "eval_false_trustworthy_guarded": { "mean": 0.027630805408583186, "std": 0.0, "min": 0.027630805408583186, "max": 0.027630805408583186 } }, "key_test_slices": { "consistent_chain": { "accuracy_baseline": { "mean": 0.7470449172576832, "std": 0.0503163041460822, "min": 0.7021276595744681, "max": 0.8014184397163121 }, "accuracy_guarded": { "mean": 0.7092198581560284, "std": 0.039487690516524974, "min": 0.6737588652482269, "max": 0.75177304964539 }, "false_trustworthy_baseline": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 }, "false_trustworthy_guarded": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 } }, "multi_source_corroboration": { "accuracy_baseline": { "mean": 0.6881720430107526, "std": 0.03876936855337625, "min": 0.6559139784946236, "max": 0.7311827956989247 }, "accuracy_guarded": { "mean": 0.6738351254480287, "std": 0.031040337053205722, "min": 0.6559139784946236, "max": 0.7096774193548387 }, "false_trustworthy_baseline": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 }, "false_trustworthy_guarded": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 } }, "quantitative_consensus": { "accuracy_baseline": { "mean": 0.7904761904761904, "std": 0.05039526306789702, "min": 0.7333333333333333, "max": 0.8285714285714286 }, "accuracy_guarded": { "mean": 0.780952380952381, "std": 0.050395263067896975, "min": 0.7238095238095238, "max": 0.819047619047619 }, "false_trustworthy_baseline": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 }, "false_trustworthy_guarded": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 } }, "expert_consensus": { "accuracy_baseline": { "mean": 0.7704402515723271, "std": 0.023741617720977253, "min": 0.7452830188679245, "max": 0.7924528301886793 }, "accuracy_guarded": { "mean": 0.7672955974842768, "std": 0.02882123078588584, "min": 0.7358490566037735, "max": 0.7924528301886793 }, "false_trustworthy_baseline": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 }, "false_trustworthy_guarded": { "mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0 } }, "factual_contradiction": { "accuracy_baseline": { "mean": 0.8908554572271387, "std": 0.03684364600825013, "min": 0.8495575221238938, "max": 0.9203539823008849 }, "accuracy_guarded": { "mean": 0.8908554572271387, "std": 0.03684364600825013, "min": 0.8495575221238938, "max": 0.9203539823008849 }, "false_trustworthy_baseline": { "mean": 0.061946902654867256, "std": 0.0, "min": 0.061946902654867256, "max": 0.061946902654867256 }, "false_trustworthy_guarded": { "mean": 0.061946902654867256, "std": 0.0, "min": 0.061946902654867256, "max": 0.061946902654867256 } }, "partial_overlap": { "accuracy_baseline": { "mean": 0.875, "std": 0.022047927592204947, "min": 0.8583333333333333, "max": 0.9 }, "accuracy_guarded": { "mean": 0.8888888888888888, "std": 0.024056261216234387, "min": 0.875, "max": 0.9166666666666666 }, "false_trustworthy_baseline": { "mean": 0.075, "std": 0.025, "min": 0.05, "max": 0.1 }, "false_trustworthy_guarded": { "mean": 0.06111111111111111, "std": 0.029265704869035382, "min": 0.03333333333333333, "max": 0.09166666666666666 } }, "evidence_absent": { "accuracy_baseline": { "mean": 0.8850574712643678, "std": 0.00995431498602805, "min": 0.8793103448275862, "max": 0.896551724137931 }, "accuracy_guarded": { "mean": 0.8908045977011494, "std": 0.00995431498602805, "min": 0.8793103448275862, "max": 0.896551724137931 }, "false_trustworthy_baseline": { "mean": 0.04597701149425287, "std": 0.013168320962516782, "min": 0.034482758620689655, "max": 0.0603448275862069 }, "false_trustworthy_guarded": { "mean": 0.031609195402298854, "std": 0.004977157493014015, "min": 0.02586206896551724, "max": 0.034482758620689655 } }, "wrong_entity": { "accuracy_baseline": { "mean": 0.9081632653061225, "std": 0.0176739878323355, "min": 0.8877551020408163, "max": 0.9183673469387755 }, "accuracy_guarded": { "mean": 0.9251700680272109, "std": 0.015586992159713759, "min": 0.9081632653061225, "max": 0.9387755102040817 }, "false_trustworthy_baseline": { "mean": 0.06462585034013606, "std": 0.0058913292774451596, "min": 0.061224489795918366, "max": 0.07142857142857142 }, "false_trustworthy_guarded": { "mean": 0.047619047619047616, "std": 0.005891329277445163, "min": 0.04081632653061224, "max": 0.05102040816326531 } }, "wrong_specificity": { "accuracy_baseline": { "mean": 0.9519519519519519, "std": 0.03163859985841659, "min": 0.918918918918919, "max": 0.9819819819819819 }, "accuracy_guarded": { "mean": 0.9669669669669669, "std": 0.013761488573440929, "min": 0.954954954954955, "max": 0.9819819819819819 }, "false_trustworthy_baseline": { "mean": 0.030030030030030026, "std": 0.028959912195174038, "min": 0.009009009009009009, "max": 0.06306306306306306 }, "false_trustworthy_guarded": { "mean": 0.015015015015015015, "std": 0.01040270755296623, "min": 0.009009009009009009, "max": 0.02702702702702703 } }, "numerical_conflict": { "accuracy_baseline": { "mean": 0.8707482993197279, "std": 0.029456646387225772, "min": 0.8367346938775511, "max": 0.8877551020408163 }, "accuracy_guarded": { "mean": 0.8843537414965986, "std": 0.03583555698249229, "min": 0.8469387755102041, "max": 0.9183673469387755 }, "false_trustworthy_baseline": { "mean": 0.07142857142857142, "std": 0.0, "min": 0.07142857142857142, "max": 0.07142857142857142 }, "false_trustworthy_guarded": { "mean": 0.05442176870748299, "std": 0.02124148979047074, "min": 0.030612244897959183, "max": 0.07142857142857142 } }, "temporal_conflict": { "accuracy_baseline": { "mean": 0.8766666666666667, "std": 0.020816659994661344, "min": 0.86, "max": 0.9 }, "accuracy_guarded": { "mean": 0.8766666666666667, "std": 0.020816659994661344, "min": 0.86, "max": 0.9 }, "false_trustworthy_baseline": { "mean": 0.03333333333333333, "std": 0.011547005383792516, "min": 0.02, "max": 0.04 }, "false_trustworthy_guarded": { "mean": 0.03333333333333333, "std": 0.011547005383792516, "min": 0.02, "max": 0.04 } } }, "key_test_routes": { "science_medicine": { "accuracy_baseline": { "mean": 0.8512820512820513, "std": 0.01282051282051283, "min": 0.8384615384615385, "max": 0.8641025641025641 }, "accuracy_guarded": { "mean": 0.8564102564102564, "std": 0.016012815380508718, "min": 0.8435897435897436, "max": 0.8743589743589744 }, "false_trustworthy_baseline": { "mean": 0.054421768707483, "std": 0.018405101031930184, "min": 0.036734693877551024, "max": 0.07346938775510205 }, "false_trustworthy_guarded": { "mean": 0.03129251700680272, "std": 0.008496595916188296, "min": 0.024489795918367346, "max": 0.04081632653061224 } }, "general_commonsense": { "accuracy_baseline": { "mean": 0.8795045045045046, "std": 0.0039010153323623606, "min": 0.875, "max": 0.8817567567567568 }, "accuracy_guarded": { "mean": 0.8772522522522522, "std": 0.005160558215040393, "min": 0.8716216216216216, "max": 0.8817567567567568 }, "false_trustworthy_baseline": { "mean": 0.04433497536945813, "std": 0.004926108374384237, "min": 0.03940886699507389, "max": 0.04926108374384237 }, "false_trustworthy_guarded": { "mean": 0.041050903119868636, "std": 0.0028440899960080093, "min": 0.03940886699507389, "max": 0.04433497536945813 } }, "technology_computing": { "accuracy_baseline": { "mean": 0.8904761904761904, "std": 0.005947617141331817, "min": 0.8857142857142857, "max": 0.8971428571428571 }, "accuracy_guarded": { "mean": 0.8876190476190476, "std": 0.0043643578047198395, "min": 0.8828571428571429, "max": 0.8914285714285715 }, "false_trustworthy_baseline": { "mean": 0.04700854700854701, "std": 0.007401926528072123, "min": 0.042735042735042736, "max": 0.05555555555555555 }, "false_trustworthy_guarded": { "mean": 0.041310541310541314, "std": 0.004934617685381415, "min": 0.038461538461538464, "max": 0.04700854700854701 } }, "law_policy": { "accuracy_baseline": { "mean": 0.9, "std": 0.014301358438186956, "min": 0.8891891891891892, "max": 0.9162162162162162 }, "accuracy_guarded": { "mean": 0.8963963963963963, "std": 0.01536821811642522, "min": 0.8837837837837837, "max": 0.9135135135135135 }, "false_trustworthy_baseline": { "mean": 0.015810276679841896, "std": 0.003952569169960475, "min": 0.011857707509881422, "max": 0.019762845849802372 }, "false_trustworthy_guarded": { "mean": 0.014492753623188404, "std": 0.0022820168742672953, "min": 0.011857707509881422, "max": 0.015810276679841896 } }, "economics_finance": { "accuracy_baseline": { "mean": 0.9124218051831994, "std": 0.0015478559495700434, "min": 0.9115281501340483, "max": 0.9142091152815014 }, "accuracy_guarded": { "mean": 0.9142091152815014, "std": 0.004643567848710066, "min": 0.9115281501340483, "max": 0.9195710455764075 }, "false_trustworthy_baseline": { "mean": 0.029850746268656716, "std": 0.009872206384569369, "min": 0.022388059701492536, "max": 0.041044776119402986 }, "false_trustworthy_guarded": { "mean": 0.02487562189054726, "std": 0.0043085840984300435, "min": 0.022388059701492536, "max": 0.029850746268656716 } } } }