Spaces:
Running
Running
| { | |
| "benchmark": "phi-coherence-comparison", | |
| "timestamp": "2026-02-28 19:40:17", | |
| "max_samples": 100, | |
| "constants": { | |
| "phi": 1.618033988749895, | |
| "alpha": 137 | |
| }, | |
| "results": [ | |
| { | |
| "method": "\u03c6-Coherence (t=0.7)", | |
| "dataset": "truthfulqa", | |
| "subset": "", | |
| "accuracy": 0.737, | |
| "precision": 0.8213, | |
| "recall": 0.8622, | |
| "f1": 0.8413, | |
| "avg_time_ms": 0.03, | |
| "total_samples": 521, | |
| "true_positives": 363, | |
| "false_positives": 79, | |
| "true_negatives": 21, | |
| "false_negatives": 58 | |
| }, | |
| { | |
| "method": "\u03c6-Coherence (t=0.5)", | |
| "dataset": "truthfulqa", | |
| "subset": "", | |
| "accuracy": 0.1919, | |
| "precision": 0, | |
| "recall": 0.0, | |
| "f1": 0, | |
| "avg_time_ms": 0.03, | |
| "total_samples": 521, | |
| "true_positives": 0, | |
| "false_positives": 0, | |
| "true_negatives": 100, | |
| "false_negatives": 421 | |
| }, | |
| { | |
| "method": "\u03c6-Coherence (t=0.6)", | |
| "dataset": "truthfulqa", | |
| "subset": "", | |
| "accuracy": 0.2361, | |
| "precision": 0.7949, | |
| "recall": 0.0736, | |
| "f1": 0.1348, | |
| "avg_time_ms": 0.03, | |
| "total_samples": 521, | |
| "true_positives": 31, | |
| "false_positives": 8, | |
| "true_negatives": 92, | |
| "false_negatives": 390 | |
| }, | |
| { | |
| "method": "Length Baseline (t=100)", | |
| "dataset": "truthfulqa", | |
| "subset": "", | |
| "accuracy": 0.3647, | |
| "precision": 0.8516, | |
| "recall": 0.2589, | |
| "f1": 0.3971, | |
| "avg_time_ms": 0.0, | |
| "total_samples": 521, | |
| "true_positives": 109, | |
| "false_positives": 19, | |
| "true_negatives": 81, | |
| "false_negatives": 312 | |
| }, | |
| { | |
| "method": "Random Baseline", | |
| "dataset": "truthfulqa", | |
| "subset": "", | |
| "accuracy": 0.4894, | |
| "precision": 0.7947, | |
| "recall": 0.4964, | |
| "f1": 0.6111, | |
| "avg_time_ms": 0.0, | |
| "total_samples": 521, | |
| "true_positives": 209, | |
| "false_positives": 54, | |
| "true_negatives": 46, | |
| "false_negatives": 212 | |
| }, | |
| { | |
| "method": "\u03c6-Coherence (t=0.7)", | |
| "dataset": "halueval_qa", | |
| "subset": "", | |
| "accuracy": 0.5, | |
| "precision": 0.5, | |
| "recall": 0.98, | |
| "f1": 0.6622, | |
| "avg_time_ms": 0.09, | |
| "total_samples": 200, | |
| "true_positives": 98, | |
| "false_positives": 98, | |
| "true_negatives": 2, | |
| "false_negatives": 2 | |
| }, | |
| { | |
| "method": "\u03c6-Coherence (t=0.5)", | |
| "dataset": "halueval_qa", | |
| "subset": "", | |
| "accuracy": 0.5, | |
| "precision": 0, | |
| "recall": 0.0, | |
| "f1": 0, | |
| "avg_time_ms": 0.09, | |
| "total_samples": 200, | |
| "true_positives": 0, | |
| "false_positives": 0, | |
| "true_negatives": 100, | |
| "false_negatives": 100 | |
| }, | |
| { | |
| "method": "\u03c6-Coherence (t=0.6)", | |
| "dataset": "halueval_qa", | |
| "subset": "", | |
| "accuracy": 0.575, | |
| "precision": 0.6471, | |
| "recall": 0.33, | |
| "f1": 0.4371, | |
| "avg_time_ms": 0.09, | |
| "total_samples": 200, | |
| "true_positives": 33, | |
| "false_positives": 18, | |
| "true_negatives": 82, | |
| "false_negatives": 67 | |
| }, | |
| { | |
| "method": "Length Baseline (t=100)", | |
| "dataset": "halueval_qa", | |
| "subset": "", | |
| "accuracy": 0.5, | |
| "precision": 0, | |
| "recall": 0.0, | |
| "f1": 0, | |
| "avg_time_ms": 0.0, | |
| "total_samples": 200, | |
| "true_positives": 0, | |
| "false_positives": 0, | |
| "true_negatives": 100, | |
| "false_negatives": 100 | |
| }, | |
| { | |
| "method": "Random Baseline", | |
| "dataset": "halueval_qa", | |
| "subset": "", | |
| "accuracy": 0.465, | |
| "precision": 0.4639, | |
| "recall": 0.45, | |
| "f1": 0.4569, | |
| "avg_time_ms": 0.0, | |
| "total_samples": 200, | |
| "true_positives": 45, | |
| "false_positives": 52, | |
| "true_negatives": 48, | |
| "false_negatives": 55 | |
| } | |
| ] | |
| } |