{
  "project": "OCC: Oracle-Credit-Compute",
  "repository": "https://huggingface.co/narcolepticchicken/occ-stack",
  "date": "2026-05-05",
  "key_results": {
    "code_simulated": {
      "strategy": "OCC tiered escalation",
      "pass_at_1": 0.78,
      "iso_accuracy": true,
      "compute_savings_vs_fixed": "52.3%",
      "baseline_compute": 17500,
      "occ_compute": 8350
    },
    "code_real_llm": {
      "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
      "status": "attempted_but_blocked",
      "blocker": "code_extraction_heuristics_fail",
      "accuracy": 0.0,
      "note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
    },
    "qa_simulated": {
      "occ_accuracy": 0.71,
      "rag_verifier_accuracy": 0.79,
      "occ_retrievals": 227,
      "rag_retrievals": 338,
      "note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
    },
    "debate_v2_adversarial": {
      "occ_accuracy": 0.76,
      "confidence_weighted_accuracy": 0.56,
      "occ_bad_agent_containment": "100%",
      "note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
    }
  },
  "anti_gaming": {
    "spam_attack": "100% credit exhaustion after ~10 actions",
    "hidden_test_gaming": "100% oracle detection",
    "over_abstention": "70% penalization",
    "collusion": "Credit-based filtering excludes adversarial agents"
  },
  "limitations": [
    "All main results are from simulated agents, not real LLMs",
    "Real LLM code extraction needs improvement (markdown/AST)",
    "Retrieval QA accuracy below RAG+verifier baseline",
    "GRPO training hook implemented but not executed on real data"
  ],
  "next_steps": [
    "Fix code extraction for real LLM inference",
    "Domain-tune NLI for QA evidence scoring",
    "Run GRPO training on Qwen 0.5B with DeepMath-103K",
    "Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
  ]
}