narcolepticchicken committed on
Commit
af936b6
·
verified ·
1 Parent(s): 5ad2b8b

Upload reports/blackwell_results_v9.json

Browse files
Files changed (1) hide show
  1. reports/blackwell_results_v9.json +97 -0
reports/blackwell_results_v9.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "platform": "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
3
+ "pytorch": "2.11.0+cu130",
4
+ "cuda": "13.0",
5
+ "vram_gb": 95,
6
+ "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
7
+ "seed": 42,
8
+ "date": "2026-05-07",
9
+ "execution": {
10
+ "script": "occ_benchmark_all.py (v2, subprocess+check)",
11
+ "patches": [
12
+ "total_mem → total_memory (PyTorch 2.11 property name)",
13
+ "local HF cache (global /srv/models/hf-cache full)",
14
+ "subprocess explicit check(entry_point) call"
15
+ ]
16
+ },
17
+ "debate": {
18
+ "topics": 30,
19
+ "agents_per_topic": 4,
20
+ "baseline_equal_1round": {
21
+ "accuracy": 0.867,
22
+ "correct": 26,
23
+ "total": 30,
24
+ "tokens": 42752
25
+ },
26
+ "occ_240_5": {
27
+ "accuracy": 0.933,
28
+ "correct": 28,
29
+ "total": 30,
30
+ "tokens": 40259,
31
+ "denied": 5
32
+ },
33
+ "occ_180_3": {
34
+ "accuracy": 0.967,
35
+ "correct": 29,
36
+ "total": 30,
37
+ "tokens": 42760,
38
+ "denied": 0
39
+ },
40
+ "occ_120_3": {
41
+ "accuracy": 0.833,
42
+ "correct": 25,
43
+ "total": 30,
44
+ "tokens": 41309,
45
+ "denied": 0
46
+ },
47
+ "occ_delta_over_baseline": "+10.0pp (180/3 on both platforms)"
48
+ },
49
+ "humaneval": {
50
+ "n_problems": 164,
51
+ "evaluation": "subprocess.run(sys.executable, timeout=30) + explicit check(entry_point)",
52
+ "pass1_128tok": {
53
+ "passed": "~55",
54
+ "tokens": "see totals"
55
+ },
56
+ "pass2_1024tok": {
57
+ "recovered": "~20"
58
+ },
59
+ "final": {
60
+ "pass_at_1": 0.335,
61
+ "correct": 55,
62
+ "total_tokens": 62886,
63
+ "baseline_tokens_all_1024": 167936,
64
+ "token_savings_pct": 62.6
65
+ },
66
+ "methodology_note": "The prior H200 result (75.0%) used in-process exec() without an explicit check() call. This Blackwell run uses the corrected methodology. An H200 re-run is pending."
67
+ },
68
+ "truthfulqa": {
69
+ "n_questions": 60,
70
+ "scoring": "0.0=misconception, 0.5=unclear, 1.0=correct. Scored against TruthfulQA correct_answers/incorrect_answers.",
71
+ "direct": {
72
+ "truthfulness": 0.325,
73
+ "misconceptions": 23,
74
+ "tokens": 7349
75
+ },
76
+ "occ_tiered": {
77
+ "note": "retry count data pending"
78
+ },
79
+ "occ_abstain": {
80
+ "truthfulness": 0.395,
81
+ "misconceptions": 11,
82
+ "abstained": 17,
83
+ "answered": 43,
84
+ "tokens": 5345,
85
+ "token_savings_vs_direct_pct": 27.3
86
+ }
87
+ },
88
+ "cross_platform": {
89
+ "h200_baseline_debate_acc": 0.767,
90
+ "blackwell_baseline_debate_acc": 0.867,
91
+ "h200_occ_180_3_debate_acc": 0.867,
92
+ "blackwell_occ_180_3_debate_acc": 0.967,
93
+ "occ_delta_h200": "+10.0pp",
94
+ "occ_delta_blackwell": "+10.0pp",
95
+ "note": "OCC delta is identical across platforms despite 10pp baseline difference. Baseline difference likely due to PyTorch 2.9→2.11 and CUDA 12→13 sampling distribution shifts."
96
+ }
97
+ }