occ-stack / reports /results_summary.json
narcolepticchicken's picture
Upload reports/results_summary.json
58df7e3 verified
{
"project": "OCC: Oracle-Credit-Compute",
"repository": "https://huggingface.co/narcolepticchicken/occ-stack",
"date": "2026-05-05",
"key_results": {
"code_simulated": {
"strategy": "OCC tiered escalation",
"pass_at_1": 0.78,
"iso_accuracy": true,
"compute_savings_vs_fixed": "52.3%",
"baseline_compute": 17500,
"occ_compute": 8350
},
"code_real_llm": {
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"status": "attempted_but_blocked",
"blocker": "code_extraction_heuristics_fail",
"accuracy": 0.0,
"note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
},
"qa_simulated": {
"occ_accuracy": 0.71,
"rag_verifier_accuracy": 0.79,
"occ_retrievals": 227,
"rag_retrievals": 338,
"note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
},
"debate_v2_adversarial": {
"occ_accuracy": 0.76,
"confidence_weighted_accuracy": 0.56,
"occ_bad_agent_containment": "100%",
"note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
}
},
"anti_gaming": {
"spam_attack": "100% credit exhaustion after ~10 actions",
"hidden_test_gaming": "100% oracle detection",
"over_abstention": "70% penalization",
"collusion": "Credit-based filtering excludes adversarial agents"
},
"limitations": [
"All main results are from simulated agents, not real LLMs",
"Real LLM code extraction needs improvement (markdown/AST)",
"Retrieval QA accuracy below RAG+verifier baseline",
"GRPO training hook implemented but not executed on real data"
],
"next_steps": [
"Fix code extraction for real LLM inference",
"Domain-tune NLI for QA evidence scoring",
"Run GRPO training on Qwen 0.5B with DeepMath-103K",
"Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
]
}