{
  "project": "OCC: Oracle-Credit-Compute",
  "repository": "https://huggingface.co/narcolepticchicken/occ-stack",
  "date": "2026-05-05",
  "key_results": {
    "code_simulated": {
      "strategy": "OCC tiered escalation",
      "pass_at_1": 0.78,
      "iso_accuracy": true,
      "compute_savings_vs_fixed": "52.3%",
      "baseline_compute": 17500,
      "occ_compute": 8350
    },
    "code_real_llm": {
      "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
      "status": "attempted_but_blocked",
      "blocker": "code_extraction_heuristics_fail",
      "accuracy": 0.0,
      "note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
    },
    "qa_simulated": {
      "occ_accuracy": 0.71,
      "rag_verifier_accuracy": 0.79,
      "occ_retrievals": 227,
      "rag_retrievals": 338,
      "note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
    },
    "debate_v2_adversarial": {
      "occ_accuracy": 0.76,
      "confidence_weighted_accuracy": 0.56,
      "occ_bad_agent_containment": "100%",
      "note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
    }
  },
  "anti_gaming": {
    "spam_attack": "100% credit exhaustion after ~10 actions",
    "hidden_test_gaming": "100% oracle detection",
    "over_abstention": "70% penalization",
    "collusion": "Credit-based filtering excludes adversarial agents"
  },
  "limitations": [
    "All main results are from simulated agents, not real LLMs",
    "Real LLM code extraction needs improvement (markdown/AST)",
    "Retrieval QA accuracy below RAG+verifier baseline",
    "GRPO training hook implemented but not executed on real data"
  ],
  "next_steps": [
    "Fix code extraction for real LLM inference",
    "Domain-tune NLI for QA evidence scoring",
    "Run GRPO training on Qwen 0.5B with DeepMath-103K",
    "Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
  ]
}