Upload reports/results_summary.json
Browse files- reports/results_summary.json +53 -0
reports/results_summary.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"project": "OCC: Oracle-Credit-Compute",
|
| 3 |
+
"repository": "https://huggingface.co/narcolepticchicken/occ-stack",
|
| 4 |
+
"date": "2026-05-05",
|
| 5 |
+
"key_results": {
|
| 6 |
+
"code_simulated": {
|
| 7 |
+
"strategy": "OCC tiered escalation",
|
| 8 |
+
"pass_at_1": 0.78,
|
| 9 |
+
"iso_accuracy": true,
|
| 10 |
+
"compute_savings_vs_fixed": "52.3%",
|
| 11 |
+
"baseline_compute": 17500,
|
| 12 |
+
"occ_compute": 8350
|
| 13 |
+
},
|
| 14 |
+
"code_real_llm": {
|
| 15 |
+
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
|
| 16 |
+
"status": "attempted_but_blocked",
|
| 17 |
+
"blocker": "code_extraction_heuristics_fail",
|
| 18 |
+
"accuracy": 0.0,
|
| 19 |
+
"note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
|
| 20 |
+
},
|
| 21 |
+
"qa_simulated": {
|
| 22 |
+
"occ_accuracy": 0.71,
|
| 23 |
+
"rag_verifier_accuracy": 0.79,
|
| 24 |
+
"occ_retrievals": 227,
|
| 25 |
+
"rag_retrievals": 338,
|
| 26 |
+
"note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
|
| 27 |
+
},
|
| 28 |
+
"debate_v2_adversarial": {
|
| 29 |
+
"occ_accuracy": 0.76,
|
| 30 |
+
"confidence_weighted_accuracy": 0.56,
|
| 31 |
+
"occ_bad_agent_containment": "100%",
|
| 32 |
+
"note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"anti_gaming": {
|
| 36 |
+
"spam_attack": "100% credit exhaustion after ~10 actions",
|
| 37 |
+
"hidden_test_gaming": "100% oracle detection",
|
| 38 |
+
"over_abstention": "70% penalization",
|
| 39 |
+
"collusion": "Credit-based filtering excludes adversarial agents"
|
| 40 |
+
},
|
| 41 |
+
"limitations": [
|
| 42 |
+
"All main results are from simulated agents, not real LLMs",
|
| 43 |
+
"Real LLM code extraction needs improvement (markdown/AST)",
|
| 44 |
+
"Retrieval QA accuracy below RAG+verifier baseline",
|
| 45 |
+
"GRPO training hook implemented but not executed on real data"
|
| 46 |
+
],
|
| 47 |
+
"next_steps": [
|
| 48 |
+
"Fix code extraction for real LLM inference",
|
| 49 |
+
"Domain-tune NLI for QA evidence scoring",
|
| 50 |
+
"Run GRPO training on Qwen 0.5B with DeepMath-103K",
|
| 51 |
+
"Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
|
| 52 |
+
]
|
| 53 |
+
}
|