{
"benchmark": "phi-coherence-comparison",
"timestamp": "2026-02-28 19:40:17",
"max_samples": 100,
"constants": {
"phi": 1.618033988749895,
"alpha": 137
},
"results": [
{
"method": "\u03c6-Coherence (t=0.7)",
"dataset": "truthfulqa",
"subset": "",
"accuracy": 0.737,
"precision": 0.8213,
"recall": 0.8622,
"f1": 0.8413,
"avg_time_ms": 0.03,
"total_samples": 521,
"true_positives": 363,
"false_positives": 79,
"true_negatives": 21,
"false_negatives": 58
},
{
"method": "\u03c6-Coherence (t=0.5)",
"dataset": "truthfulqa",
"subset": "",
"accuracy": 0.1919,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"avg_time_ms": 0.03,
"total_samples": 521,
"true_positives": 0,
"false_positives": 0,
"true_negatives": 100,
"false_negatives": 421
},
{
"method": "\u03c6-Coherence (t=0.6)",
"dataset": "truthfulqa",
"subset": "",
"accuracy": 0.2361,
"precision": 0.7949,
"recall": 0.0736,
"f1": 0.1348,
"avg_time_ms": 0.03,
"total_samples": 521,
"true_positives": 31,
"false_positives": 8,
"true_negatives": 92,
"false_negatives": 390
},
{
"method": "Length Baseline (t=100)",
"dataset": "truthfulqa",
"subset": "",
"accuracy": 0.3647,
"precision": 0.8516,
"recall": 0.2589,
"f1": 0.3971,
"avg_time_ms": 0.0,
"total_samples": 521,
"true_positives": 109,
"false_positives": 19,
"true_negatives": 81,
"false_negatives": 312
},
{
"method": "Random Baseline",
"dataset": "truthfulqa",
"subset": "",
"accuracy": 0.4894,
"precision": 0.7947,
"recall": 0.4964,
"f1": 0.6111,
"avg_time_ms": 0.0,
"total_samples": 521,
"true_positives": 209,
"false_positives": 54,
"true_negatives": 46,
"false_negatives": 212
},
{
"method": "\u03c6-Coherence (t=0.7)",
"dataset": "halueval_qa",
"subset": "",
"accuracy": 0.5,
"precision": 0.5,
"recall": 0.98,
"f1": 0.6622,
"avg_time_ms": 0.09,
"total_samples": 200,
"true_positives": 98,
"false_positives": 98,
"true_negatives": 2,
"false_negatives": 2
},
{
"method": "\u03c6-Coherence (t=0.5)",
"dataset": "halueval_qa",
"subset": "",
"accuracy": 0.5,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"avg_time_ms": 0.09,
"total_samples": 200,
"true_positives": 0,
"false_positives": 0,
"true_negatives": 100,
"false_negatives": 100
},
{
"method": "\u03c6-Coherence (t=0.6)",
"dataset": "halueval_qa",
"subset": "",
"accuracy": 0.575,
"precision": 0.6471,
"recall": 0.33,
"f1": 0.4371,
"avg_time_ms": 0.09,
"total_samples": 200,
"true_positives": 33,
"false_positives": 18,
"true_negatives": 82,
"false_negatives": 67
},
{
"method": "Length Baseline (t=100)",
"dataset": "halueval_qa",
"subset": "",
"accuracy": 0.5,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"avg_time_ms": 0.0,
"total_samples": 200,
"true_positives": 0,
"false_positives": 0,
"true_negatives": 100,
"false_negatives": 100
},
{
"method": "Random Baseline",
"dataset": "halueval_qa",
"subset": "",
"accuracy": 0.465,
"precision": 0.4639,
"recall": 0.45,
"f1": 0.4569,
"avg_time_ms": 0.0,
"total_samples": 200,
"true_positives": 45,
"false_positives": 52,
"true_negatives": 48,
"false_negatives": 55
}
]
}