{ "project": "OCC: Oracle-Credit-Compute", "repository": "https://huggingface.co/narcolepticchicken/occ-stack", "date": "2026-05-05", "key_results": { "code_simulated": { "strategy": "OCC tiered escalation", "pass_at_1": 0.78, "iso_accuracy": true, "compute_savings_vs_fixed": "52.3%", "baseline_compute": 17500, "occ_compute": 8350 }, "code_real_llm": { "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "status": "attempted_but_blocked", "blocker": "code_extraction_heuristics_fail", "accuracy": 0.0, "note": "Model loads and generates on GPU. Need markdown stripping + AST validation." }, "qa_simulated": { "occ_accuracy": 0.71, "rag_verifier_accuracy": 0.79, "occ_retrievals": 227, "rag_retrievals": 338, "note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative." }, "debate_v2_adversarial": { "occ_accuracy": 0.76, "confidence_weighted_accuracy": 0.56, "occ_bad_agent_containment": "100%", "note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out." } }, "anti_gaming": { "spam_attack": "100% credit exhaustion after ~10 actions", "hidden_test_gaming": "100% oracle detection", "over_abstention": "70% penalization", "collusion": "Credit-based filtering excludes adversarial agents" }, "limitations": [ "All main results are from simulated agents, not real LLMs", "Real LLM code extraction needs improvement (markdown/AST)", "Retrieval QA accuracy below RAG+verifier baseline", "GRPO training hook implemented but not executed on real data" ], "next_steps": [ "Fix code extraction for real LLM inference", "Domain-tune NLI for QA evidence scoring", "Run GRPO training on Qwen 0.5B with DeepMath-103K", "Publish as workshop paper (SafeGenAI/ALTA/ALOE)" ] }