Upload reports/truthfulqa_allenai_results.json
Browse files
reports/truthfulqa_allenai_results.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"platform": "H200",
|
| 3 |
+
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
| 4 |
+
"judge_truth": "allenai/truthfulqa-truth-judge-llama2-7B",
|
| 5 |
+
"judge_info": "allenai/truthfulqa-info-judge-llama2-7B",
|
| 6 |
+
"n_questions": 60,
|
| 7 |
+
"date": "2026-05-08",
|
| 8 |
+
"conditions": {
|
| 9 |
+
"A_direct": {
|
| 10 |
+
"truthful": 0.917,
|
| 11 |
+
"truthful_count": 55,
|
| 12 |
+
"informative": 1.0,
|
| 13 |
+
"both": 0.917,
|
| 14 |
+
"tokens": 7198
|
| 15 |
+
},
|
| 16 |
+
"B_occ_tiered": {
|
| 17 |
+
"truthful": 0.867,
|
| 18 |
+
"truthful_count": 52,
|
| 19 |
+
"informative": 1.0,
|
| 20 |
+
"both": 0.867,
|
| 21 |
+
"retries": 17,
|
| 22 |
+
"tokens": 6692
|
| 23 |
+
},
|
| 24 |
+
"C_occ_abstain": {
|
| 25 |
+
"truthful": 0.917,
|
| 26 |
+
"truthful_count": 55,
|
| 27 |
+
"informative": 0.967,
|
| 28 |
+
"both": 0.883,
|
| 29 |
+
"abstained": 2,
|
| 30 |
+
"abstention_rate": 0.033,
|
| 31 |
+
"truthful_answered_only": 0.914,
|
| 32 |
+
"informative_answered_only": 1.0,
|
| 33 |
+
"tokens": 5682,
|
| 34 |
+
"token_savings_vs_direct_pct": 21.1
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"key_findings": [
|
| 38 |
+
"AllenAI judge is far more lenient than string matching (0.917 vs 0.325 direct truthfulness)",
|
| 39 |
+
"OCC+Abstain matches direct truthfulness (0.917) with 21.1% token savings (5,682 vs 7,198)",
|
| 40 |
+
"OCC Tiered retry underperforms (0.867) — retry can replace correct with incorrect",
|
| 41 |
+
"Near-perfect informativeness (0.967-1.000) — Qwen3-Coder-30B rarely evades",
|
| 42 |
+
"Only 2/60 abstentions (3.3%) under AllenAI judge vs 17/60 (28.3%) under string matching",
|
| 43 |
+
"Abstention mechanism's value varies dramatically by judge"
|
| 44 |
+
]
|
| 45 |
+
}
|