narcolepticchicken
/

occ-stack

ml-intern

Model card · Files and versions

xet

Community

narcolepticchicken committed on 26 days ago

Commit

58df7e3

verified ·

1 Parent(s): 8ee1677

Upload reports/results_summary.json

Browse files

Files changed (1) hide show

reports/results_summary.json +53 -0

reports/results_summary.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "project": "OCC: Oracle-Credit-Compute",
+  "repository": "https://huggingface.co/narcolepticchicken/occ-stack",
+  "date": "2026-05-05",
+  "key_results": {
+    "code_simulated": {
+      "strategy": "OCC tiered escalation",
+      "pass_at_1": 0.78,
+      "iso_accuracy": true,
+      "compute_savings_vs_fixed": "52.3%",
+      "baseline_compute": 17500,
+      "occ_compute": 8350
+    },
+    "code_real_llm": {
+      "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+      "status": "attempted_but_blocked",
+      "blocker": "code_extraction_heuristics_fail",
+      "accuracy": 0.0,
+      "note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
+    },
+    "qa_simulated": {
+      "occ_accuracy": 0.71,
+      "rag_verifier_accuracy": 0.79,
+      "occ_retrievals": 227,
+      "rag_retrievals": 338,
+      "note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
+    },
+    "debate_v2_adversarial": {
+      "occ_accuracy": 0.76,
+      "confidence_weighted_accuracy": 0.56,
+      "occ_bad_agent_containment": "100%",
+      "note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
+    }
+  },
+  "anti_gaming": {
+    "spam_attack": "100% credit exhaustion after ~10 actions",
+    "hidden_test_gaming": "100% oracle detection",
+    "over_abstention": "70% penalization",
+    "collusion": "Credit-based filtering excludes adversarial agents"
+  },
+  "limitations": [
+    "All main results are from simulated agents, not real LLMs",
+    "Real LLM code extraction needs improvement (markdown/AST)",
+    "Retrieval QA accuracy below RAG+verifier baseline",
+    "GRPO training hook implemented but not executed on real data"
+  ],
+  "next_steps": [
+    "Fix code extraction for real LLM inference",
+    "Domain-tune NLI for QA evidence scoring",
+    "Run GRPO training on Qwen 0.5B with DeepMath-103K",
+    "Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
+  ]
+}