narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed on 21 days ago

Commit

ef70478

verified ·

1 Parent(s): fdeb852

Upload reports/truthfulqa_allenai_results.json

Browse files

Files changed (1) hide show

reports/truthfulqa_allenai_results.json +45 -0

reports/truthfulqa_allenai_results.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "platform": "H200",
+  "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+  "judge_truth": "allenai/truthfulqa-truth-judge-llama2-7B",
+  "judge_info": "allenai/truthfulqa-info-judge-llama2-7B",
+  "n_questions": 60,
+  "date": "2026-05-08",
+  "conditions": {
+    "A_direct": {
+      "truthful": 0.917,
+      "truthful_count": 55,
+      "informative": 1.0,
+      "both": 0.917,
+      "tokens": 7198
+    },
+    "B_occ_tiered": {
+      "truthful": 0.867,
+      "truthful_count": 52,
+      "informative": 1.0,
+      "both": 0.867,
+      "retries": 17,
+      "tokens": 6692
+    },
+    "C_occ_abstain": {
+      "truthful": 0.917,
+      "truthful_count": 55,
+      "informative": 0.967,
+      "both": 0.883,
+      "abstained": 2,
+      "abstention_rate": 0.033,
+      "truthful_answered_only": 0.914,
+      "informative_answered_only": 1.0,
+      "tokens": 5682,
+      "token_savings_vs_direct_pct": 21.1
+    }
+  },
+  "key_findings": [
+    "AllenAI judge is far more lenient than string matching (0.917 vs 0.325 direct truthfulness)",
+    "OCC+Abstain matches direct truthfulness (0.917) with 21.1% token savings (5,682 vs 7,198)",
+    "OCC Tiered retry underperforms (0.867) — retry can replace correct with incorrect",
+    "Near-perfect informativeness (0.967-1.000) — Qwen3-Coder-30B rarely evades",
+    "Only 2/60 abstentions (3.3%) under AllenAI judge vs 17/60 (28.3%) under string matching",
+    "Abstention mechanism's value varies dramatically by judge"
+  ]
+}