{ "benchmark": "phi-coherence-comparison", "timestamp": "2026-02-28 19:40:17", "max_samples": 100, "constants": { "phi": 1.618033988749895, "alpha": 137 }, "results": [ { "method": "\u03c6-Coherence (t=0.7)", "dataset": "truthfulqa", "subset": "", "accuracy": 0.737, "precision": 0.8213, "recall": 0.8622, "f1": 0.8413, "avg_time_ms": 0.03, "total_samples": 521, "true_positives": 363, "false_positives": 79, "true_negatives": 21, "false_negatives": 58 }, { "method": "\u03c6-Coherence (t=0.5)", "dataset": "truthfulqa", "subset": "", "accuracy": 0.1919, "precision": 0, "recall": 0.0, "f1": 0, "avg_time_ms": 0.03, "total_samples": 521, "true_positives": 0, "false_positives": 0, "true_negatives": 100, "false_negatives": 421 }, { "method": "\u03c6-Coherence (t=0.6)", "dataset": "truthfulqa", "subset": "", "accuracy": 0.2361, "precision": 0.7949, "recall": 0.0736, "f1": 0.1348, "avg_time_ms": 0.03, "total_samples": 521, "true_positives": 31, "false_positives": 8, "true_negatives": 92, "false_negatives": 390 }, { "method": "Length Baseline (t=100)", "dataset": "truthfulqa", "subset": "", "accuracy": 0.3647, "precision": 0.8516, "recall": 0.2589, "f1": 0.3971, "avg_time_ms": 0.0, "total_samples": 521, "true_positives": 109, "false_positives": 19, "true_negatives": 81, "false_negatives": 312 }, { "method": "Random Baseline", "dataset": "truthfulqa", "subset": "", "accuracy": 0.4894, "precision": 0.7947, "recall": 0.4964, "f1": 0.6111, "avg_time_ms": 0.0, "total_samples": 521, "true_positives": 209, "false_positives": 54, "true_negatives": 46, "false_negatives": 212 }, { "method": "\u03c6-Coherence (t=0.7)", "dataset": "halueval_qa", "subset": "", "accuracy": 0.5, "precision": 0.5, "recall": 0.98, "f1": 0.6622, "avg_time_ms": 0.09, "total_samples": 200, "true_positives": 98, "false_positives": 98, "true_negatives": 2, "false_negatives": 2 }, { "method": "\u03c6-Coherence (t=0.5)", "dataset": "halueval_qa", "subset": "", "accuracy": 0.5, "precision": 0, "recall": 0.0, "f1": 0, "avg_time_ms": 0.09, "total_samples": 200, "true_positives": 0, "false_positives": 0, "true_negatives": 100, "false_negatives": 100 }, { "method": "\u03c6-Coherence (t=0.6)", "dataset": "halueval_qa", "subset": "", "accuracy": 0.575, "precision": 0.6471, "recall": 0.33, "f1": 0.4371, "avg_time_ms": 0.09, "total_samples": 200, "true_positives": 33, "false_positives": 18, "true_negatives": 82, "false_negatives": 67 }, { "method": "Length Baseline (t=100)", "dataset": "halueval_qa", "subset": "", "accuracy": 0.5, "precision": 0, "recall": 0.0, "f1": 0, "avg_time_ms": 0.0, "total_samples": 200, "true_positives": 0, "false_positives": 0, "true_negatives": 100, "false_negatives": 100 }, { "method": "Random Baseline", "dataset": "halueval_qa", "subset": "", "accuracy": 0.465, "precision": 0.4639, "recall": 0.45, "f1": 0.4569, "avg_time_ms": 0.0, "total_samples": 200, "true_positives": 45, "false_positives": 52, "true_negatives": 48, "false_negatives": 55 } ] }