Upload reports/blackwell_results_v9.json
Browse files
reports/blackwell_results_v9.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"platform": "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
|
| 3 |
+
"pytorch": "2.11.0+cu130",
|
| 4 |
+
"cuda": "13.0",
|
| 5 |
+
"vram_gb": 95,
|
| 6 |
+
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
| 7 |
+
"seed": 42,
|
| 8 |
+
"date": "2026-05-07",
|
| 9 |
+
"execution": {
|
| 10 |
+
"script": "occ_benchmark_all.py (v2, subprocess+check)",
|
| 11 |
+
"patches": [
|
| 12 |
+
"total_mem → total_memory (PyTorch 2.11 property name)",
|
| 13 |
+
"local HF cache (global /srv/models/hf-cache full)",
|
| 14 |
+
"subprocess explicit check(entry_point) call"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
"debate": {
|
| 18 |
+
"topics": 30,
|
| 19 |
+
"agents_per_topic": 4,
|
| 20 |
+
"baseline_equal_1round": {
|
| 21 |
+
"accuracy": 0.867,
|
| 22 |
+
"correct": 26,
|
| 23 |
+
"total": 30,
|
| 24 |
+
"tokens": 42752
|
| 25 |
+
},
|
| 26 |
+
"occ_240_5": {
|
| 27 |
+
"accuracy": 0.933,
|
| 28 |
+
"correct": 28,
|
| 29 |
+
"total": 30,
|
| 30 |
+
"tokens": 40259,
|
| 31 |
+
"denied": 5
|
| 32 |
+
},
|
| 33 |
+
"occ_180_3": {
|
| 34 |
+
"accuracy": 0.967,
|
| 35 |
+
"correct": 29,
|
| 36 |
+
"total": 30,
|
| 37 |
+
"tokens": 42760,
|
| 38 |
+
"denied": 0
|
| 39 |
+
},
|
| 40 |
+
"occ_120_3": {
|
| 41 |
+
"accuracy": 0.833,
|
| 42 |
+
"correct": 25,
|
| 43 |
+
"total": 30,
|
| 44 |
+
"tokens": 41309,
|
| 45 |
+
"denied": 0
|
| 46 |
+
},
|
| 47 |
+
"occ_delta_over_baseline": "+10.0pp (180/3 on both platforms)"
|
| 48 |
+
},
|
| 49 |
+
"humaneval": {
|
| 50 |
+
"n_problems": 164,
|
| 51 |
+
"evaluation": "subprocess.run(sys.executable, timeout=30) + explicit check(entry_point)",
|
| 52 |
+
"pass1_128tok": {
|
| 53 |
+
"passed": "~55",
|
| 54 |
+
"tokens": "see final.total_tokens"
|
| 55 |
+
},
|
| 56 |
+
"pass2_1024tok": {
|
| 57 |
+
"recovered": "~20"
|
| 58 |
+
},
|
| 59 |
+
"final": {
|
| 60 |
+
"pass_at_1": 0.335,
|
| 61 |
+
"correct": 55,
|
| 62 |
+
"total_tokens": 62886,
|
| 63 |
+
"baseline_tokens_all_1024": 167936,
|
| 64 |
+
"token_savings_pct": 62.6
|
| 65 |
+
},
|
| 66 |
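+
"methodology_note": "The prior H200 result of 75.0% was measured with in-process exec() and no explicit check() call, so it is not comparable to this run. This Blackwell run uses the corrected methodology (subprocess execution plus an explicit check(entry_point) call). An H200 re-run with the corrected methodology is pending."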
|
| 67 |
+
},
|
| 68 |
+
"truthfulqa": {
|
| 69 |
+
"n_questions": 60,
|
| 70 |
+
"scoring": "0.0=misconception, 0.5=unclear, 1.0=correct. Scored against TruthfulQA correct_answers/incorrect_answers.",
|
| 71 |
+
"direct": {
|
| 72 |
+
"truthfulness": 0.325,
|
| 73 |
+
"misconceptions": 23,
|
| 74 |
+
"tokens": 7349
|
| 75 |
+
},
|
| 76 |
+
"occ_tiered": {
|
| 77 |
+
"note": "retry count data pending"
|
| 78 |
+
},
|
| 79 |
+
"occ_abstain": {
|
| 80 |
+
"truthfulness": 0.395,
|
| 81 |
+
"misconceptions": 11,
|
| 82 |
+
"abstained": 17,
|
| 83 |
+
"answered": 43,
|
| 84 |
+
"tokens": 5345,
|
| 85 |
+
"token_savings_vs_direct_pct": 27.3
|
| 86 |
+
}
|
| 87 |
+
},
|
| 88 |
+
"cross_platform": {
|
| 89 |
+
"h200_baseline_debate_acc": 0.767,
|
| 90 |
+
"blackwell_baseline_debate_acc": 0.867,
|
| 91 |
+
"h200_occ_180_3_debate_acc": 0.867,
|
| 92 |
+
"blackwell_occ_180_3_debate_acc": 0.967,
|
| 93 |
+
"occ_delta_h200": "+10.0pp",
|
| 94 |
+
"occ_delta_blackwell": "+10.0pp",
|
| 95 |
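+
"note": "The OCC 180/3 delta over baseline is identical (+10.0pp) on H200 and Blackwell despite a 10.0pp difference in baseline accuracy (0.767 vs 0.867). The baseline difference is likely due to sampling-distribution shifts between PyTorch 2.9 and 2.11 and between CUDA 12 and 13."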
|
| 96 |
+
}
|
| 97 |
+
}
|