Sage / data /calibration /token_threshold.analysis.json
vxa8502's picture
Calibrate evidence quality gate thresholds
fbc14e7
{
"token_analysis": {
"recommended_threshold": 20,
"best_f1": 0.805,
"best_precision": 0.733,
"best_recall": 0.892,
"correlation_tokens_hhem": -0.54,
"current_threshold": 30,
"total_samples": 50,
"hallucination_rate": 0.26,
"mean_tokens": 181.1,
"median_tokens": 128.0,
"all_thresholds": [
{
"precision": 0.733,
"recall": 0.892,
"f1": 0.805,
"tp": 33,
"fp": 12,
"tn": 1,
"fn": 4,
"n_passed": 45,
"n_refused": 5,
"threshold": 20,
"mean_hhem_above": 0.68
},
{
"precision": 0.732,
"recall": 0.811,
"f1": 0.769,
"tp": 30,
"fp": 11,
"tn": 2,
"fn": 7,
"n_passed": 41,
"n_refused": 9,
"threshold": 30,
"mean_hhem_above": 0.675
},
{
"precision": 0.718,
"recall": 0.757,
"f1": 0.737,
"tp": 28,
"fp": 11,
"tn": 2,
"fn": 9,
"n_passed": 39,
"n_refused": 11,
"threshold": 40,
"mean_hhem_above": 0.668
},
{
"precision": 0.711,
"recall": 0.73,
"f1": 0.72,
"tp": 27,
"fp": 11,
"tn": 2,
"fn": 10,
"n_passed": 38,
"n_refused": 12,
"threshold": 50,
"mean_hhem_above": 0.666
},
{
"precision": 0.694,
"recall": 0.676,
"f1": 0.685,
"tp": 25,
"fp": 11,
"tn": 2,
"fn": 12,
"n_passed": 36,
"n_refused": 14,
"threshold": 60,
"mean_hhem_above": 0.655
},
{
"precision": 0.686,
"recall": 0.649,
"f1": 0.667,
"tp": 24,
"fp": 11,
"tn": 2,
"fn": 13,
"n_passed": 35,
"n_refused": 15,
"threshold": 70,
"mean_hhem_above": 0.646
},
{
"precision": 0.667,
"recall": 0.595,
"f1": 0.629,
"tp": 22,
"fp": 11,
"tn": 2,
"fn": 15,
"n_passed": 33,
"n_refused": 17,
"threshold": 80,
"mean_hhem_above": 0.639
},
{
"precision": 0.667,
"recall": 0.541,
"f1": 0.597,
"tp": 20,
"fp": 10,
"tn": 3,
"fn": 17,
"n_passed": 30,
"n_refused": 20,
"threshold": 90,
"mean_hhem_above": 0.642
},
{
"precision": 0.655,
"recall": 0.514,
"f1": 0.576,
"tp": 19,
"fp": 10,
"tn": 3,
"fn": 18,
"n_passed": 29,
"n_refused": 21,
"threshold": 100,
"mean_hhem_above": 0.631
},
{
"precision": 0.615,
"recall": 0.432,
"f1": 0.508,
"tp": 16,
"fp": 10,
"tn": 3,
"fn": 21,
"n_passed": 26,
"n_refused": 24,
"threshold": 110,
"mean_hhem_above": 0.616
},
{
"precision": 0.615,
"recall": 0.432,
"f1": 0.508,
"tp": 16,
"fp": 10,
"tn": 3,
"fn": 21,
"n_passed": 26,
"n_refused": 24,
"threshold": 120,
"mean_hhem_above": 0.616
},
{
"precision": 0.6,
"recall": 0.405,
"f1": 0.484,
"tp": 15,
"fp": 10,
"tn": 3,
"fn": 22,
"n_passed": 25,
"n_refused": 25,
"threshold": 130,
"mean_hhem_above": 0.604
},
{
"precision": 0.583,
"recall": 0.378,
"f1": 0.459,
"tp": 14,
"fp": 10,
"tn": 3,
"fn": 23,
"n_passed": 24,
"n_refused": 26,
"threshold": 140,
"mean_hhem_above": 0.593
},
{
"precision": 0.571,
"recall": 0.324,
"f1": 0.414,
"tp": 12,
"fp": 9,
"tn": 4,
"fn": 25,
"n_passed": 21,
"n_refused": 29,
"threshold": 150,
"mean_hhem_above": 0.589
},
{
"precision": 0.5,
"recall": 0.243,
"f1": 0.327,
"tp": 9,
"fp": 9,
"tn": 4,
"fn": 28,
"n_passed": 18,
"n_refused": 32,
"threshold": 160,
"mean_hhem_above": 0.534
},
{
"precision": 0.471,
"recall": 0.216,
"f1": 0.296,
"tp": 8,
"fp": 9,
"tn": 4,
"fn": 29,
"n_passed": 17,
"n_refused": 33,
"threshold": 170,
"mean_hhem_above": 0.508
},
{
"precision": 0.438,
"recall": 0.189,
"f1": 0.264,
"tp": 7,
"fp": 9,
"tn": 4,
"fn": 30,
"n_passed": 16,
"n_refused": 34,
"threshold": 180,
"mean_hhem_above": 0.481
},
{
"precision": 0.438,
"recall": 0.189,
"f1": 0.264,
"tp": 7,
"fp": 9,
"tn": 4,
"fn": 30,
"n_passed": 16,
"n_refused": 34,
"threshold": 190,
"mean_hhem_above": 0.481
}
]
},
"chunk_analysis": {
"recommended_threshold": 1,
"best_f1": 0.851,
"best_precision": 0.74,
"best_recall": 1.0,
"correlation_chunks_hhem": -0.514,
"current_threshold": 1,
"mean_chunks": 2.06,
"all_thresholds": [
{
"precision": 0.74,
"recall": 1.0,
"f1": 0.851,
"tp": 37,
"fp": 13,
"tn": 0,
"fn": 0,
"n_passed": 50,
"n_refused": 0,
"threshold": 1,
"mean_hhem_above": 0.679
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"threshold": 2,
"mean_hhem_above": 0.539
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"threshold": 3,
"mean_hhem_above": 0.398
},
{
"precision": 0.375,
"recall": 0.081,
"f1": 0.133,
"tp": 3,
"fp": 5,
"tn": 8,
"fn": 34,
"n_passed": 8,
"n_refused": 42,
"threshold": 4,
"mean_hhem_above": 0.427
},
{
"precision": 0.25,
"recall": 0.027,
"f1": 0.049,
"tp": 1,
"fp": 3,
"tn": 10,
"fn": 36,
"n_passed": 4,
"n_refused": 46,
"threshold": 5,
"mean_hhem_above": 0.402
}
]
},
"combined_analysis": {
"recommended_token_threshold": 20,
"recommended_chunk_threshold": 1,
"best_f1": 0.805,
"best_precision": 0.733,
"best_recall": 0.892,
"current_token_threshold": 30,
"current_chunk_threshold": 1,
"all_combinations": [
{
"precision": 0.733,
"recall": 0.892,
"f1": 0.805,
"tp": 33,
"fp": 12,
"tn": 1,
"fn": 4,
"n_passed": 45,
"n_refused": 5,
"token_threshold": 20,
"chunk_threshold": 1
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"token_threshold": 20,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 20,
"chunk_threshold": 3
},
{
"precision": 0.732,
"recall": 0.811,
"f1": 0.769,
"tp": 30,
"fp": 11,
"tn": 2,
"fn": 7,
"n_passed": 41,
"n_refused": 9,
"token_threshold": 30,
"chunk_threshold": 1
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"token_threshold": 30,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 30,
"chunk_threshold": 3
},
{
"precision": 0.718,
"recall": 0.757,
"f1": 0.737,
"tp": 28,
"fp": 11,
"tn": 2,
"fn": 9,
"n_passed": 39,
"n_refused": 11,
"token_threshold": 40,
"chunk_threshold": 1
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"token_threshold": 40,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 40,
"chunk_threshold": 3
},
{
"precision": 0.711,
"recall": 0.73,
"f1": 0.72,
"tp": 27,
"fp": 11,
"tn": 2,
"fn": 10,
"n_passed": 38,
"n_refused": 12,
"token_threshold": 50,
"chunk_threshold": 1
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"token_threshold": 50,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 50,
"chunk_threshold": 3
},
{
"precision": 0.667,
"recall": 0.595,
"f1": 0.629,
"tp": 22,
"fp": 11,
"tn": 2,
"fn": 15,
"n_passed": 33,
"n_refused": 17,
"token_threshold": 75,
"chunk_threshold": 1
},
{
"precision": 0.522,
"recall": 0.324,
"f1": 0.4,
"tp": 12,
"fp": 11,
"tn": 2,
"fn": 25,
"n_passed": 23,
"n_refused": 27,
"token_threshold": 75,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 75,
"chunk_threshold": 3
},
{
"precision": 0.655,
"recall": 0.514,
"f1": 0.576,
"tp": 19,
"fp": 10,
"tn": 3,
"fn": 18,
"n_passed": 29,
"n_refused": 21,
"token_threshold": 100,
"chunk_threshold": 1
},
{
"precision": 0.524,
"recall": 0.297,
"f1": 0.379,
"tp": 11,
"fp": 10,
"tn": 3,
"fn": 26,
"n_passed": 21,
"n_refused": 29,
"token_threshold": 100,
"chunk_threshold": 2
},
{
"precision": 0.308,
"recall": 0.108,
"f1": 0.16,
"tp": 4,
"fp": 9,
"tn": 4,
"fn": 33,
"n_passed": 13,
"n_refused": 37,
"token_threshold": 100,
"chunk_threshold": 3
}
]
}
}