narcolepticchicken committed on
Commit
ef70478
·
verified ·
1 Parent(s): fdeb852

Upload reports/truthfulqa_allenai_results.json

Browse files
reports/truthfulqa_allenai_results.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "platform": "H200",
3
+ "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
4
+ "judge_truth": "allenai/truthfulqa-truth-judge-llama2-7B",
5
+ "judge_info": "allenai/truthfulqa-info-judge-llama2-7B",
6
+ "n_questions": 60,
7
+ "date": "2026-05-08",
8
+ "conditions": {
9
+ "A_direct": {
10
+ "truthful": 0.917,
11
+ "truthful_count": 55,
12
+ "informative": 1.0,
13
+ "both": 0.917,
14
+ "tokens": 7198
15
+ },
16
+ "B_occ_tiered": {
17
+ "truthful": 0.867,
18
+ "truthful_count": 52,
19
+ "informative": 1.0,
20
+ "both": 0.867,
21
+ "retries": 17,
22
+ "tokens": 6692
23
+ },
24
+ "C_occ_abstain": {
25
+ "truthful": 0.917,
26
+ "truthful_count": 55,
27
+ "informative": 0.967,
28
+ "both": 0.883,
29
+ "abstained": 2,
30
+ "abstention_rate": 0.033,
31
+ "truthful_answered_only": 0.914,
32
+ "informative_answered_only": 1.0,
33
+ "tokens": 5682,
34
+ "token_savings_vs_direct_pct": 21.1
35
+ }
36
+ },
37
+ "key_findings": [
38
+ "AllenAI judge is far more lenient than string matching (0.917 vs 0.325 direct truthfulness)",
39
+ "OCC+Abstain matches direct truthfulness (0.917) with 21.1% token savings (5,682 vs 7,198)",
40
+ "OCC Tiered retry underperforms (0.867) — a retry can replace a correct answer with an incorrect one",
41
+ "Near-perfect informativeness (0.967-1.000) — Qwen3-Coder-30B rarely evades",
42
+ "Only 2/60 abstentions (3.3%) under AllenAI judge vs 17/60 (28.3%) under string matching",
43
+ "Abstention mechanism's value varies dramatically by judge"
44
+ ]
45
+ }