| { | |
| "stage": "stage0_7_posthoc_verifier_ft028", | |
| "base_stage": "stage0_7_support_aggregation", | |
| "verifier_kind": "hgb", | |
| "target_ft": 0.028, | |
| "max_accuracy_drop": 0.015, | |
| "seeds": [ | |
| 42, | |
| 1337, | |
| 7 | |
| ], | |
| "runs": [ | |
| { | |
| "seed": 42, | |
| "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed42_ft028\\verifier_report.json", | |
| "selected_threshold": 0.73, | |
| "selection_reason": "target_ft_and_accuracy_floor", | |
| "test_accuracy_baseline": 0.9003660024400163, | |
| "test_false_trustworthy_baseline": 0.032582938388625596, | |
| "test_trustworthy_recall_baseline": 0.8326848249027238, | |
| "test_accuracy_guarded": 0.9007726718178121, | |
| "test_false_trustworthy_guarded": 0.021919431279620854, | |
| "test_trustworthy_recall_guarded": 0.814526588845655, | |
| "test_rejected_candidate_trustworthy": 32, | |
| "eval_accuracy_guarded": 0.8926392842618951, | |
| "eval_false_trustworthy_guarded": 0.027630805408583186 | |
| }, | |
| { | |
| "seed": 1337, | |
| "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed1337_ft028\\verifier_report.json", | |
| "selected_threshold": 0.62, | |
| "selection_reason": "target_ft_and_accuracy_floor", | |
| "test_accuracy_baseline": 0.8918259455063033, | |
| "test_false_trustworthy_baseline": 0.0254739336492891, | |
| "test_trustworthy_recall_baseline": 0.8184176394293126, | |
| "test_accuracy_guarded": 0.89019926799512, | |
| "test_false_trustworthy_guarded": 0.022511848341232227, | |
| "test_trustworthy_recall_guarded": 0.808041504539559, | |
| "test_rejected_candidate_trustworthy": 13, | |
| "eval_accuracy_guarded": 0.8938592923952826, | |
| "eval_false_trustworthy_guarded": 0.027630805408583186 | |
| }, | |
| { | |
| "seed": 7, | |
| "path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed7_ft028\\verifier_report.json", | |
| "selected_threshold": 0.67, | |
| "selection_reason": "target_ft_and_accuracy_floor", | |
| "test_accuracy_baseline": 0.8889792598617324, | |
| "test_false_trustworthy_baseline": 0.030213270142180094, | |
| "test_trustworthy_recall_baseline": 0.7937743190661478, | |
| "test_accuracy_guarded": 0.8877592517283448, | |
| "test_false_trustworthy_guarded": 0.02665876777251185, | |
| "test_trustworthy_recall_guarded": 0.7833981841763943, | |
| "test_rejected_candidate_trustworthy": 14, | |
| "eval_accuracy_guarded": 0.8934526230174867, | |
| "eval_false_trustworthy_guarded": 0.027630805408583186 | |
| } | |
| ], | |
| "mean_std": { | |
| "selected_threshold": { | |
| "mean": 0.6733333333333333, | |
| "std": 0.05507570547286101, | |
| "min": 0.62, | |
| "max": 0.73 | |
| }, | |
| "test_accuracy_baseline": { | |
| "mean": 0.8937237359360174, | |
| "std": 0.0059258487174717244, | |
| "min": 0.8889792598617324, | |
| "max": 0.9003660024400163 | |
| }, | |
| "test_false_trustworthy_baseline": { | |
| "mean": 0.029423380726698263, | |
| "std": 0.0036197280370899227, | |
| "min": 0.0254739336492891, | |
| "max": 0.032582938388625596 | |
| }, | |
| "test_trustworthy_recall_baseline": { | |
| "mean": 0.8149589277993947, | |
| "std": 0.019684483247440092, | |
| "min": 0.7937743190661478, | |
| "max": 0.8326848249027238 | |
| }, | |
| "test_accuracy_guarded": { | |
| "mean": 0.8929103971804256, | |
| "std": 0.00691736522823536, | |
| "min": 0.8877592517283448, | |
| "max": 0.9007726718178121 | |
| }, | |
| "test_false_trustworthy_guarded": { | |
| "mean": 0.023696682464454978, | |
| "std": 0.0025822861039932906, | |
| "min": 0.021919431279620854, | |
| "max": 0.02665876777251185 | |
| }, | |
| "test_trustworthy_recall_guarded": { | |
| "mean": 0.8019887591872028, | |
| "std": 0.016423190586444085, | |
| "min": 0.7833981841763943, | |
| "max": 0.814526588845655 | |
| }, | |
| "test_rejected_candidate_trustworthy": { | |
| "mean": 19.666666666666668, | |
| "std": 10.692676621563626, | |
| "min": 13, | |
| "max": 32 | |
| }, | |
| "eval_accuracy_guarded": { | |
| "mean": 0.8933170665582215, | |
| "std": 0.0006211977355233384, | |
| "min": 0.8926392842618951, | |
| "max": 0.8938592923952826 | |
| }, | |
| "eval_false_trustworthy_guarded": { | |
| "mean": 0.027630805408583186, | |
| "std": 0.0, | |
| "min": 0.027630805408583186, | |
| "max": 0.027630805408583186 | |
| } | |
| }, | |
| "key_test_slices": { | |
| "consistent_chain": { | |
| "accuracy_baseline": { | |
| "mean": 0.7470449172576832, | |
| "std": 0.0503163041460822, | |
| "min": 0.7021276595744681, | |
| "max": 0.8014184397163121 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.7092198581560284, | |
| "std": 0.039487690516524974, | |
| "min": 0.6737588652482269, | |
| "max": 0.75177304964539 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| } | |
| }, | |
| "multi_source_corroboration": { | |
| "accuracy_baseline": { | |
| "mean": 0.6881720430107526, | |
| "std": 0.03876936855337625, | |
| "min": 0.6559139784946236, | |
| "max": 0.7311827956989247 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.6738351254480287, | |
| "std": 0.031040337053205722, | |
| "min": 0.6559139784946236, | |
| "max": 0.7096774193548387 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| } | |
| }, | |
| "quantitative_consensus": { | |
| "accuracy_baseline": { | |
| "mean": 0.7904761904761904, | |
| "std": 0.05039526306789702, | |
| "min": 0.7333333333333333, | |
| "max": 0.8285714285714286 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.780952380952381, | |
| "std": 0.050395263067896975, | |
| "min": 0.7238095238095238, | |
| "max": 0.819047619047619 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| } | |
| }, | |
| "expert_consensus": { | |
| "accuracy_baseline": { | |
| "mean": 0.7704402515723271, | |
| "std": 0.023741617720977253, | |
| "min": 0.7452830188679245, | |
| "max": 0.7924528301886793 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.7672955974842768, | |
| "std": 0.02882123078588584, | |
| "min": 0.7358490566037735, | |
| "max": 0.7924528301886793 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.0, | |
| "std": 0.0, | |
| "min": 0.0, | |
| "max": 0.0 | |
| } | |
| }, | |
| "factual_contradiction": { | |
| "accuracy_baseline": { | |
| "mean": 0.8908554572271387, | |
| "std": 0.03684364600825013, | |
| "min": 0.8495575221238938, | |
| "max": 0.9203539823008849 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8908554572271387, | |
| "std": 0.03684364600825013, | |
| "min": 0.8495575221238938, | |
| "max": 0.9203539823008849 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.061946902654867256, | |
| "std": 0.0, | |
| "min": 0.061946902654867256, | |
| "max": 0.061946902654867256 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.061946902654867256, | |
| "std": 0.0, | |
| "min": 0.061946902654867256, | |
| "max": 0.061946902654867256 | |
| } | |
| }, | |
| "partial_overlap": { | |
| "accuracy_baseline": { | |
| "mean": 0.875, | |
| "std": 0.022047927592204947, | |
| "min": 0.8583333333333333, | |
| "max": 0.9 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8888888888888888, | |
| "std": 0.024056261216234387, | |
| "min": 0.875, | |
| "max": 0.9166666666666666 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.075, | |
| "std": 0.025, | |
| "min": 0.05, | |
| "max": 0.1 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.06111111111111111, | |
| "std": 0.029265704869035382, | |
| "min": 0.03333333333333333, | |
| "max": 0.09166666666666666 | |
| } | |
| }, | |
| "evidence_absent": { | |
| "accuracy_baseline": { | |
| "mean": 0.8850574712643678, | |
| "std": 0.00995431498602805, | |
| "min": 0.8793103448275862, | |
| "max": 0.896551724137931 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8908045977011494, | |
| "std": 0.00995431498602805, | |
| "min": 0.8793103448275862, | |
| "max": 0.896551724137931 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.04597701149425287, | |
| "std": 0.013168320962516782, | |
| "min": 0.034482758620689655, | |
| "max": 0.0603448275862069 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.031609195402298854, | |
| "std": 0.004977157493014015, | |
| "min": 0.02586206896551724, | |
| "max": 0.034482758620689655 | |
| } | |
| }, | |
| "wrong_entity": { | |
| "accuracy_baseline": { | |
| "mean": 0.9081632653061225, | |
| "std": 0.0176739878323355, | |
| "min": 0.8877551020408163, | |
| "max": 0.9183673469387755 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.9251700680272109, | |
| "std": 0.015586992159713759, | |
| "min": 0.9081632653061225, | |
| "max": 0.9387755102040817 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.06462585034013606, | |
| "std": 0.0058913292774451596, | |
| "min": 0.061224489795918366, | |
| "max": 0.07142857142857142 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.047619047619047616, | |
| "std": 0.005891329277445163, | |
| "min": 0.04081632653061224, | |
| "max": 0.05102040816326531 | |
| } | |
| }, | |
| "wrong_specificity": { | |
| "accuracy_baseline": { | |
| "mean": 0.9519519519519519, | |
| "std": 0.03163859985841659, | |
| "min": 0.918918918918919, | |
| "max": 0.9819819819819819 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.9669669669669669, | |
| "std": 0.013761488573440929, | |
| "min": 0.954954954954955, | |
| "max": 0.9819819819819819 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.030030030030030026, | |
| "std": 0.028959912195174038, | |
| "min": 0.009009009009009009, | |
| "max": 0.06306306306306306 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.015015015015015015, | |
| "std": 0.01040270755296623, | |
| "min": 0.009009009009009009, | |
| "max": 0.02702702702702703 | |
| } | |
| }, | |
| "numerical_conflict": { | |
| "accuracy_baseline": { | |
| "mean": 0.8707482993197279, | |
| "std": 0.029456646387225772, | |
| "min": 0.8367346938775511, | |
| "max": 0.8877551020408163 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8843537414965986, | |
| "std": 0.03583555698249229, | |
| "min": 0.8469387755102041, | |
| "max": 0.9183673469387755 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.07142857142857142, | |
| "std": 0.0, | |
| "min": 0.07142857142857142, | |
| "max": 0.07142857142857142 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.05442176870748299, | |
| "std": 0.02124148979047074, | |
| "min": 0.030612244897959183, | |
| "max": 0.07142857142857142 | |
| } | |
| }, | |
| "temporal_conflict": { | |
| "accuracy_baseline": { | |
| "mean": 0.8766666666666667, | |
| "std": 0.020816659994661344, | |
| "min": 0.86, | |
| "max": 0.9 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8766666666666667, | |
| "std": 0.020816659994661344, | |
| "min": 0.86, | |
| "max": 0.9 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.03333333333333333, | |
| "std": 0.011547005383792516, | |
| "min": 0.02, | |
| "max": 0.04 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.03333333333333333, | |
| "std": 0.011547005383792516, | |
| "min": 0.02, | |
| "max": 0.04 | |
| } | |
| } | |
| }, | |
| "key_test_routes": { | |
| "science_medicine": { | |
| "accuracy_baseline": { | |
| "mean": 0.8512820512820513, | |
| "std": 0.01282051282051283, | |
| "min": 0.8384615384615385, | |
| "max": 0.8641025641025641 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8564102564102564, | |
| "std": 0.016012815380508718, | |
| "min": 0.8435897435897436, | |
| "max": 0.8743589743589744 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.054421768707483, | |
| "std": 0.018405101031930184, | |
| "min": 0.036734693877551024, | |
| "max": 0.07346938775510205 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.03129251700680272, | |
| "std": 0.008496595916188296, | |
| "min": 0.024489795918367346, | |
| "max": 0.04081632653061224 | |
| } | |
| }, | |
| "general_commonsense": { | |
| "accuracy_baseline": { | |
| "mean": 0.8795045045045046, | |
| "std": 0.0039010153323623606, | |
| "min": 0.875, | |
| "max": 0.8817567567567568 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8772522522522522, | |
| "std": 0.005160558215040393, | |
| "min": 0.8716216216216216, | |
| "max": 0.8817567567567568 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.04433497536945813, | |
| "std": 0.004926108374384237, | |
| "min": 0.03940886699507389, | |
| "max": 0.04926108374384237 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.041050903119868636, | |
| "std": 0.0028440899960080093, | |
| "min": 0.03940886699507389, | |
| "max": 0.04433497536945813 | |
| } | |
| }, | |
| "technology_computing": { | |
| "accuracy_baseline": { | |
| "mean": 0.8904761904761904, | |
| "std": 0.005947617141331817, | |
| "min": 0.8857142857142857, | |
| "max": 0.8971428571428571 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8876190476190476, | |
| "std": 0.0043643578047198395, | |
| "min": 0.8828571428571429, | |
| "max": 0.8914285714285715 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.04700854700854701, | |
| "std": 0.007401926528072123, | |
| "min": 0.042735042735042736, | |
| "max": 0.05555555555555555 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.041310541310541314, | |
| "std": 0.004934617685381415, | |
| "min": 0.038461538461538464, | |
| "max": 0.04700854700854701 | |
| } | |
| }, | |
| "law_policy": { | |
| "accuracy_baseline": { | |
| "mean": 0.9, | |
| "std": 0.014301358438186956, | |
| "min": 0.8891891891891892, | |
| "max": 0.9162162162162162 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.8963963963963963, | |
| "std": 0.01536821811642522, | |
| "min": 0.8837837837837837, | |
| "max": 0.9135135135135135 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.015810276679841896, | |
| "std": 0.003952569169960475, | |
| "min": 0.011857707509881422, | |
| "max": 0.019762845849802372 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.014492753623188404, | |
| "std": 0.0022820168742672953, | |
| "min": 0.011857707509881422, | |
| "max": 0.015810276679841896 | |
| } | |
| }, | |
| "economics_finance": { | |
| "accuracy_baseline": { | |
| "mean": 0.9124218051831994, | |
| "std": 0.0015478559495700434, | |
| "min": 0.9115281501340483, | |
| "max": 0.9142091152815014 | |
| }, | |
| "accuracy_guarded": { | |
| "mean": 0.9142091152815014, | |
| "std": 0.004643567848710066, | |
| "min": 0.9115281501340483, | |
| "max": 0.9195710455764075 | |
| }, | |
| "false_trustworthy_baseline": { | |
| "mean": 0.029850746268656716, | |
| "std": 0.009872206384569369, | |
| "min": 0.022388059701492536, | |
| "max": 0.041044776119402986 | |
| }, | |
| "false_trustworthy_guarded": { | |
| "mean": 0.02487562189054726, | |
| "std": 0.0043085840984300435, | |
| "min": 0.022388059701492536, | |
| "max": 0.029850746268656716 | |
| } | |
| } | |
| } | |
| } |