pyrrho-MoE-g3-alpha / summary.json
yafitzdev's picture
Release pyrrho-MoE-g3-alpha
e56ceaa verified
{
"stage": "stage0_7_posthoc_verifier_ft028",
"base_stage": "stage0_7_support_aggregation",
"verifier_kind": "hgb",
"target_ft": 0.028,
"max_accuracy_drop": 0.015,
"seeds": [
42,
1337,
7
],
"runs": [
{
"seed": 42,
"path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed42_ft028\\verifier_report.json",
"selected_threshold": 0.73,
"selection_reason": "target_ft_and_accuracy_floor",
"test_accuracy_baseline": 0.9003660024400163,
"test_false_trustworthy_baseline": 0.032582938388625596,
"test_trustworthy_recall_baseline": 0.8326848249027238,
"test_accuracy_guarded": 0.9007726718178121,
"test_false_trustworthy_guarded": 0.021919431279620854,
"test_trustworthy_recall_guarded": 0.814526588845655,
"test_rejected_candidate_trustworthy": 32,
"eval_accuracy_guarded": 0.8926392842618951,
"eval_false_trustworthy_guarded": 0.027630805408583186
},
{
"seed": 1337,
"path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed1337_ft028\\verifier_report.json",
"selected_threshold": 0.62,
"selection_reason": "target_ft_and_accuracy_floor",
"test_accuracy_baseline": 0.8918259455063033,
"test_false_trustworthy_baseline": 0.0254739336492891,
"test_trustworthy_recall_baseline": 0.8184176394293126,
"test_accuracy_guarded": 0.89019926799512,
"test_false_trustworthy_guarded": 0.022511848341232227,
"test_trustworthy_recall_guarded": 0.808041504539559,
"test_rejected_candidate_trustworthy": 13,
"eval_accuracy_guarded": 0.8938592923952826,
"eval_false_trustworthy_guarded": 0.027630805408583186
},
{
"seed": 7,
"path": "outputs\\moe\\stage0_7_posthoc_verifier_g3_seed7_ft028\\verifier_report.json",
"selected_threshold": 0.67,
"selection_reason": "target_ft_and_accuracy_floor",
"test_accuracy_baseline": 0.8889792598617324,
"test_false_trustworthy_baseline": 0.030213270142180094,
"test_trustworthy_recall_baseline": 0.7937743190661478,
"test_accuracy_guarded": 0.8877592517283448,
"test_false_trustworthy_guarded": 0.02665876777251185,
"test_trustworthy_recall_guarded": 0.7833981841763943,
"test_rejected_candidate_trustworthy": 14,
"eval_accuracy_guarded": 0.8934526230174867,
"eval_false_trustworthy_guarded": 0.027630805408583186
}
],
"mean_std": {
"selected_threshold": {
"mean": 0.6733333333333333,
"std": 0.05507570547286101,
"min": 0.62,
"max": 0.73
},
"test_accuracy_baseline": {
"mean": 0.8937237359360174,
"std": 0.0059258487174717244,
"min": 0.8889792598617324,
"max": 0.9003660024400163
},
"test_false_trustworthy_baseline": {
"mean": 0.029423380726698263,
"std": 0.0036197280370899227,
"min": 0.0254739336492891,
"max": 0.032582938388625596
},
"test_trustworthy_recall_baseline": {
"mean": 0.8149589277993947,
"std": 0.019684483247440092,
"min": 0.7937743190661478,
"max": 0.8326848249027238
},
"test_accuracy_guarded": {
"mean": 0.8929103971804256,
"std": 0.00691736522823536,
"min": 0.8877592517283448,
"max": 0.9007726718178121
},
"test_false_trustworthy_guarded": {
"mean": 0.023696682464454978,
"std": 0.0025822861039932906,
"min": 0.021919431279620854,
"max": 0.02665876777251185
},
"test_trustworthy_recall_guarded": {
"mean": 0.8019887591872028,
"std": 0.016423190586444085,
"min": 0.7833981841763943,
"max": 0.814526588845655
},
"test_rejected_candidate_trustworthy": {
"mean": 19.666666666666668,
"std": 10.692676621563626,
"min": 13,
"max": 32
},
"eval_accuracy_guarded": {
"mean": 0.8933170665582215,
"std": 0.0006211977355233384,
"min": 0.8926392842618951,
"max": 0.8938592923952826
},
"eval_false_trustworthy_guarded": {
"mean": 0.027630805408583186,
"std": 0.0,
"min": 0.027630805408583186,
"max": 0.027630805408583186
}
},
"key_test_slices": {
"consistent_chain": {
"accuracy_baseline": {
"mean": 0.7470449172576832,
"std": 0.0503163041460822,
"min": 0.7021276595744681,
"max": 0.8014184397163121
},
"accuracy_guarded": {
"mean": 0.7092198581560284,
"std": 0.039487690516524974,
"min": 0.6737588652482269,
"max": 0.75177304964539
},
"false_trustworthy_baseline": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
},
"false_trustworthy_guarded": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
}
},
"multi_source_corroboration": {
"accuracy_baseline": {
"mean": 0.6881720430107526,
"std": 0.03876936855337625,
"min": 0.6559139784946236,
"max": 0.7311827956989247
},
"accuracy_guarded": {
"mean": 0.6738351254480287,
"std": 0.031040337053205722,
"min": 0.6559139784946236,
"max": 0.7096774193548387
},
"false_trustworthy_baseline": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
},
"false_trustworthy_guarded": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
}
},
"quantitative_consensus": {
"accuracy_baseline": {
"mean": 0.7904761904761904,
"std": 0.05039526306789702,
"min": 0.7333333333333333,
"max": 0.8285714285714286
},
"accuracy_guarded": {
"mean": 0.780952380952381,
"std": 0.050395263067896975,
"min": 0.7238095238095238,
"max": 0.819047619047619
},
"false_trustworthy_baseline": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
},
"false_trustworthy_guarded": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
}
},
"expert_consensus": {
"accuracy_baseline": {
"mean": 0.7704402515723271,
"std": 0.023741617720977253,
"min": 0.7452830188679245,
"max": 0.7924528301886793
},
"accuracy_guarded": {
"mean": 0.7672955974842768,
"std": 0.02882123078588584,
"min": 0.7358490566037735,
"max": 0.7924528301886793
},
"false_trustworthy_baseline": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
},
"false_trustworthy_guarded": {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0
}
},
"factual_contradiction": {
"accuracy_baseline": {
"mean": 0.8908554572271387,
"std": 0.03684364600825013,
"min": 0.8495575221238938,
"max": 0.9203539823008849
},
"accuracy_guarded": {
"mean": 0.8908554572271387,
"std": 0.03684364600825013,
"min": 0.8495575221238938,
"max": 0.9203539823008849
},
"false_trustworthy_baseline": {
"mean": 0.061946902654867256,
"std": 0.0,
"min": 0.061946902654867256,
"max": 0.061946902654867256
},
"false_trustworthy_guarded": {
"mean": 0.061946902654867256,
"std": 0.0,
"min": 0.061946902654867256,
"max": 0.061946902654867256
}
},
"partial_overlap": {
"accuracy_baseline": {
"mean": 0.875,
"std": 0.022047927592204947,
"min": 0.8583333333333333,
"max": 0.9
},
"accuracy_guarded": {
"mean": 0.8888888888888888,
"std": 0.024056261216234387,
"min": 0.875,
"max": 0.9166666666666666
},
"false_trustworthy_baseline": {
"mean": 0.075,
"std": 0.025,
"min": 0.05,
"max": 0.1
},
"false_trustworthy_guarded": {
"mean": 0.06111111111111111,
"std": 0.029265704869035382,
"min": 0.03333333333333333,
"max": 0.09166666666666666
}
},
"evidence_absent": {
"accuracy_baseline": {
"mean": 0.8850574712643678,
"std": 0.00995431498602805,
"min": 0.8793103448275862,
"max": 0.896551724137931
},
"accuracy_guarded": {
"mean": 0.8908045977011494,
"std": 0.00995431498602805,
"min": 0.8793103448275862,
"max": 0.896551724137931
},
"false_trustworthy_baseline": {
"mean": 0.04597701149425287,
"std": 0.013168320962516782,
"min": 0.034482758620689655,
"max": 0.0603448275862069
},
"false_trustworthy_guarded": {
"mean": 0.031609195402298854,
"std": 0.004977157493014015,
"min": 0.02586206896551724,
"max": 0.034482758620689655
}
},
"wrong_entity": {
"accuracy_baseline": {
"mean": 0.9081632653061225,
"std": 0.0176739878323355,
"min": 0.8877551020408163,
"max": 0.9183673469387755
},
"accuracy_guarded": {
"mean": 0.9251700680272109,
"std": 0.015586992159713759,
"min": 0.9081632653061225,
"max": 0.9387755102040817
},
"false_trustworthy_baseline": {
"mean": 0.06462585034013606,
"std": 0.0058913292774451596,
"min": 0.061224489795918366,
"max": 0.07142857142857142
},
"false_trustworthy_guarded": {
"mean": 0.047619047619047616,
"std": 0.005891329277445163,
"min": 0.04081632653061224,
"max": 0.05102040816326531
}
},
"wrong_specificity": {
"accuracy_baseline": {
"mean": 0.9519519519519519,
"std": 0.03163859985841659,
"min": 0.918918918918919,
"max": 0.9819819819819819
},
"accuracy_guarded": {
"mean": 0.9669669669669669,
"std": 0.013761488573440929,
"min": 0.954954954954955,
"max": 0.9819819819819819
},
"false_trustworthy_baseline": {
"mean": 0.030030030030030026,
"std": 0.028959912195174038,
"min": 0.009009009009009009,
"max": 0.06306306306306306
},
"false_trustworthy_guarded": {
"mean": 0.015015015015015015,
"std": 0.01040270755296623,
"min": 0.009009009009009009,
"max": 0.02702702702702703
}
},
"numerical_conflict": {
"accuracy_baseline": {
"mean": 0.8707482993197279,
"std": 0.029456646387225772,
"min": 0.8367346938775511,
"max": 0.8877551020408163
},
"accuracy_guarded": {
"mean": 0.8843537414965986,
"std": 0.03583555698249229,
"min": 0.8469387755102041,
"max": 0.9183673469387755
},
"false_trustworthy_baseline": {
"mean": 0.07142857142857142,
"std": 0.0,
"min": 0.07142857142857142,
"max": 0.07142857142857142
},
"false_trustworthy_guarded": {
"mean": 0.05442176870748299,
"std": 0.02124148979047074,
"min": 0.030612244897959183,
"max": 0.07142857142857142
}
},
"temporal_conflict": {
"accuracy_baseline": {
"mean": 0.8766666666666667,
"std": 0.020816659994661344,
"min": 0.86,
"max": 0.9
},
"accuracy_guarded": {
"mean": 0.8766666666666667,
"std": 0.020816659994661344,
"min": 0.86,
"max": 0.9
},
"false_trustworthy_baseline": {
"mean": 0.03333333333333333,
"std": 0.011547005383792516,
"min": 0.02,
"max": 0.04
},
"false_trustworthy_guarded": {
"mean": 0.03333333333333333,
"std": 0.011547005383792516,
"min": 0.02,
"max": 0.04
}
}
},
"key_test_routes": {
"science_medicine": {
"accuracy_baseline": {
"mean": 0.8512820512820513,
"std": 0.01282051282051283,
"min": 0.8384615384615385,
"max": 0.8641025641025641
},
"accuracy_guarded": {
"mean": 0.8564102564102564,
"std": 0.016012815380508718,
"min": 0.8435897435897436,
"max": 0.8743589743589744
},
"false_trustworthy_baseline": {
"mean": 0.054421768707483,
"std": 0.018405101031930184,
"min": 0.036734693877551024,
"max": 0.07346938775510205
},
"false_trustworthy_guarded": {
"mean": 0.03129251700680272,
"std": 0.008496595916188296,
"min": 0.024489795918367346,
"max": 0.04081632653061224
}
},
"general_commonsense": {
"accuracy_baseline": {
"mean": 0.8795045045045046,
"std": 0.0039010153323623606,
"min": 0.875,
"max": 0.8817567567567568
},
"accuracy_guarded": {
"mean": 0.8772522522522522,
"std": 0.005160558215040393,
"min": 0.8716216216216216,
"max": 0.8817567567567568
},
"false_trustworthy_baseline": {
"mean": 0.04433497536945813,
"std": 0.004926108374384237,
"min": 0.03940886699507389,
"max": 0.04926108374384237
},
"false_trustworthy_guarded": {
"mean": 0.041050903119868636,
"std": 0.0028440899960080093,
"min": 0.03940886699507389,
"max": 0.04433497536945813
}
},
"technology_computing": {
"accuracy_baseline": {
"mean": 0.8904761904761904,
"std": 0.005947617141331817,
"min": 0.8857142857142857,
"max": 0.8971428571428571
},
"accuracy_guarded": {
"mean": 0.8876190476190476,
"std": 0.0043643578047198395,
"min": 0.8828571428571429,
"max": 0.8914285714285715
},
"false_trustworthy_baseline": {
"mean": 0.04700854700854701,
"std": 0.007401926528072123,
"min": 0.042735042735042736,
"max": 0.05555555555555555
},
"false_trustworthy_guarded": {
"mean": 0.041310541310541314,
"std": 0.004934617685381415,
"min": 0.038461538461538464,
"max": 0.04700854700854701
}
},
"law_policy": {
"accuracy_baseline": {
"mean": 0.9,
"std": 0.014301358438186956,
"min": 0.8891891891891892,
"max": 0.9162162162162162
},
"accuracy_guarded": {
"mean": 0.8963963963963963,
"std": 0.01536821811642522,
"min": 0.8837837837837837,
"max": 0.9135135135135135
},
"false_trustworthy_baseline": {
"mean": 0.015810276679841896,
"std": 0.003952569169960475,
"min": 0.011857707509881422,
"max": 0.019762845849802372
},
"false_trustworthy_guarded": {
"mean": 0.014492753623188404,
"std": 0.0022820168742672953,
"min": 0.011857707509881422,
"max": 0.015810276679841896
}
},
"economics_finance": {
"accuracy_baseline": {
"mean": 0.9124218051831994,
"std": 0.0015478559495700434,
"min": 0.9115281501340483,
"max": 0.9142091152815014
},
"accuracy_guarded": {
"mean": 0.9142091152815014,
"std": 0.004643567848710066,
"min": 0.9115281501340483,
"max": 0.9195710455764075
},
"false_trustworthy_baseline": {
"mean": 0.029850746268656716,
"std": 0.009872206384569369,
"min": 0.022388059701492536,
"max": 0.041044776119402986
},
"false_trustworthy_guarded": {
"mean": 0.02487562189054726,
"std": 0.0043085840984300435,
"min": 0.022388059701492536,
"max": 0.029850746268656716
}
}
}
}