File size: 3,847 Bytes
36e08e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
{
  "benchmark": "phi-coherence-comparison",
  "timestamp": "2026-02-28 19:40:17",
  "max_samples": 100,
  "constants": {
    "phi": 1.618033988749895,
    "alpha": 137
  },
  "results": [
    {
      "method": "\u03c6-Coherence (t=0.7)",
      "dataset": "truthfulqa",
      "subset": "",
      "accuracy": 0.737,
      "precision": 0.8213,
      "recall": 0.8622,
      "f1": 0.8413,
      "avg_time_ms": 0.03,
      "total_samples": 521,
      "true_positives": 363,
      "false_positives": 79,
      "true_negatives": 21,
      "false_negatives": 58
    },
    {
      "method": "\u03c6-Coherence (t=0.5)",
      "dataset": "truthfulqa",
      "subset": "",
      "accuracy": 0.1919,
      "precision": 0,
      "recall": 0.0,
      "f1": 0,
      "avg_time_ms": 0.03,
      "total_samples": 521,
      "true_positives": 0,
      "false_positives": 0,
      "true_negatives": 100,
      "false_negatives": 421
    },
    {
      "method": "\u03c6-Coherence (t=0.6)",
      "dataset": "truthfulqa",
      "subset": "",
      "accuracy": 0.2361,
      "precision": 0.7949,
      "recall": 0.0736,
      "f1": 0.1348,
      "avg_time_ms": 0.03,
      "total_samples": 521,
      "true_positives": 31,
      "false_positives": 8,
      "true_negatives": 92,
      "false_negatives": 390
    },
    {
      "method": "Length Baseline (t=100)",
      "dataset": "truthfulqa",
      "subset": "",
      "accuracy": 0.3647,
      "precision": 0.8516,
      "recall": 0.2589,
      "f1": 0.3971,
      "avg_time_ms": 0.0,
      "total_samples": 521,
      "true_positives": 109,
      "false_positives": 19,
      "true_negatives": 81,
      "false_negatives": 312
    },
    {
      "method": "Random Baseline",
      "dataset": "truthfulqa",
      "subset": "",
      "accuracy": 0.4894,
      "precision": 0.7947,
      "recall": 0.4964,
      "f1": 0.6111,
      "avg_time_ms": 0.0,
      "total_samples": 521,
      "true_positives": 209,
      "false_positives": 54,
      "true_negatives": 46,
      "false_negatives": 212
    },
    {
      "method": "\u03c6-Coherence (t=0.7)",
      "dataset": "halueval_qa",
      "subset": "",
      "accuracy": 0.5,
      "precision": 0.5,
      "recall": 0.98,
      "f1": 0.6622,
      "avg_time_ms": 0.09,
      "total_samples": 200,
      "true_positives": 98,
      "false_positives": 98,
      "true_negatives": 2,
      "false_negatives": 2
    },
    {
      "method": "\u03c6-Coherence (t=0.5)",
      "dataset": "halueval_qa",
      "subset": "",
      "accuracy": 0.5,
      "precision": 0,
      "recall": 0.0,
      "f1": 0,
      "avg_time_ms": 0.09,
      "total_samples": 200,
      "true_positives": 0,
      "false_positives": 0,
      "true_negatives": 100,
      "false_negatives": 100
    },
    {
      "method": "\u03c6-Coherence (t=0.6)",
      "dataset": "halueval_qa",
      "subset": "",
      "accuracy": 0.575,
      "precision": 0.6471,
      "recall": 0.33,
      "f1": 0.4371,
      "avg_time_ms": 0.09,
      "total_samples": 200,
      "true_positives": 33,
      "false_positives": 18,
      "true_negatives": 82,
      "false_negatives": 67
    },
    {
      "method": "Length Baseline (t=100)",
      "dataset": "halueval_qa",
      "subset": "",
      "accuracy": 0.5,
      "precision": 0,
      "recall": 0.0,
      "f1": 0,
      "avg_time_ms": 0.0,
      "total_samples": 200,
      "true_positives": 0,
      "false_positives": 0,
      "true_negatives": 100,
      "false_negatives": 100
    },
    {
      "method": "Random Baseline",
      "dataset": "halueval_qa",
      "subset": "",
      "accuracy": 0.465,
      "precision": 0.4639,
      "recall": 0.45,
      "f1": 0.4569,
      "avg_time_ms": 0.0,
      "total_samples": 200,
      "true_positives": 45,
      "false_positives": 52,
      "true_negatives": 48,
      "false_negatives": 55
    }
  ]
}