narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 24 days ago

Commit

af936b6

verified ·

1 Parent(s): 5ad2b8b

Upload reports/blackwell_results_v9.json

Browse files

Files changed (1) hide show

reports/blackwell_results_v9.json +97 -0

reports/blackwell_results_v9.json ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "platform": "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
+  "pytorch": "2.11.0+cu130",
+  "cuda": "13.0",
+  "vram_gb": 95,
+  "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+  "seed": 42,
+  "date": "2026-05-07",
+  "execution": {
+    "script": "occ_benchmark_all.py (v2, subprocess+check)",
+    "patches": [
+      "total_mem → total_memory (PyTorch 2.11 property name)",
+      "local HF cache (global /srv/models/hf-cache full)",
+      "subprocess explicit check(entry_point) call"
+    ]
+  },
+  "debate": {
+    "topics": 30,
+    "agents_per_topic": 4,
+    "baseline_equal_1round": {
+      "accuracy": 0.867,
+      "correct": 26,
+      "total": 30,
+      "tokens": 42752
+    },
+    "occ_240_5": {
+      "accuracy": 0.933,
+      "correct": 28,
+      "total": 30,
+      "tokens": 40259,
+      "denied": 5
+    },
+    "occ_180_3": {
+      "accuracy": 0.967,
+      "correct": 29,
+      "total": 30,
+      "tokens": 42760,
+      "denied": 0
+    },
+    "occ_120_3": {
+      "accuracy": 0.833,
+      "correct": 25,
+      "total": 30,
+      "tokens": 41309,
+      "denied": 0
+    },
+    "occ_delta_over_baseline": "+10.0pp (180/3 on both platforms)"
+  },
+  "humaneval": {
+    "n_problems": 164,
+    "evaluation": "subprocess.run(sys.executable, timeout=30) + explicit check(entry_point)",
+    "pass1_128tok": {
+      "passed": "~55",
+      "tokens": "see totals"
+    },
+    "pass2_1024tok": {
+      "recovered": "~20"
+    },
+    "final": {
+      "pass_at_1": 0.335,
+      "correct": 55,
+      "total_tokens": 62886,
+      "baseline_tokens_all_1024": 167936,
+      "token_savings_pct": 62.6
+    },
+    "methodology_note": "The prior H200 result of 75.0% was obtained with in-process exec() and no explicit check() call. This Blackwell run uses the correct methodology. An H200 re-run is pending."
+  },
+  "truthfulqa": {
+    "n_questions": 60,
+    "scoring": "0.0=misconception, 0.5=unclear, 1.0=correct. Scored against TruthfulQA correct_answers/incorrect_answers.",
+    "direct": {
+      "truthfulness": 0.325,
+      "misconceptions": 23,
+      "tokens": 7349
+    },
+    "occ_tiered": {
+      "note": "retry count data pending"
+    },
+    "occ_abstain": {
+      "truthfulness": 0.395,
+      "misconceptions": 11,
+      "abstained": 17,
+      "answered": 43,
+      "tokens": 5345,
+      "token_savings_vs_direct_pct": 27.3
+    }
+  },
+  "cross_platform": {
+    "h200_baseline_debate_acc": 0.767,
+    "blackwell_baseline_debate_acc": 0.867,
+    "h200_occ_180_3_debate_acc": 0.867,
+    "blackwell_occ_180_3_debate_acc": 0.967,
+    "occ_delta_h200": "+10.0pp",
+    "occ_delta_blackwell": "+10.0pp",
+    "note": "OCC delta is identical across platforms despite 10pp baseline difference. Baseline difference likely due to PyTorch 2.9→2.11 and CUDA 12→13 sampling distribution shifts."
+  }
+}