narcolepticchicken
/

occ-stack

Model card Files and versions

occ-stack / reports / results_summary.json

narcolepticchicken's picture

narcolepticchicken

Upload reports/results_summary.json

58df7e3 verified 26 days ago

history blame contribute delete

1.95 kB

	{
	"project": "OCC: Oracle-Credit-Compute",
	"repository": "https://huggingface.co/narcolepticchicken/occ-stack",
	"date": "2026-05-05",
	"key_results": {
	"code_simulated": {
	"strategy": "OCC tiered escalation",
	"pass_at_1": 0.78,
	"iso_accuracy": true,
	"compute_savings_vs_fixed": "52.3%",
	"baseline_compute": 17500,
	"occ_compute": 8350
	},
	"code_real_llm": {
	"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
	"status": "attempted_but_blocked",
	"blocker": "code_extraction_heuristics_fail",
	"accuracy": 0.0,
	"note": "Model loads and generates on GPU, but the code-extraction heuristics fail on its output. Needs markdown stripping + AST validation."
	},
	"qa_simulated": {
	"occ_accuracy": 0.71,
	"rag_verifier_accuracy": 0.79,
	"occ_retrievals": 227,
	"rag_retrievals": 338,
	"note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds are too conservative."
	},
	"debate_v2_adversarial": {
	"occ_accuracy": 0.76,
	"confidence_weighted_accuracy": 0.56,
	"occ_bad_agent_containment": "100%",
	"note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
	}
	},
	"anti_gaming": {
	"spam_attack": "100% credit exhaustion after ~10 actions",
	"hidden_test_gaming": "100% oracle detection",
	"over_abstention": "70% penalization",
	"collusion": "Credit-based filtering excludes adversarial agents"
	},
	"limitations": [
	"All main results are from simulated agents, not real LLMs",
	"Real LLM code extraction needs improvement (markdown stripping and AST validation)",
	"Retrieval QA accuracy below RAG+verifier baseline",
	"GRPO training hook implemented but not executed on real data"
	],
	"next_steps": [
	"Fix code extraction for real LLM inference",
	"Domain-tune NLI for QA evidence scoring",
	"Run GRPO training on Qwen 0.5B with DeepMath-103K",
	"Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
	]
	}