narcolepticchicken committed on
Commit
58df7e3
·
verified ·
1 Parent(s): 8ee1677

Upload reports/results_summary.json

Browse files
Files changed (1) hide show
  1. reports/results_summary.json +53 -0
reports/results_summary.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "project": "OCC: Oracle-Credit-Compute",
3
+ "repository": "https://huggingface.co/narcolepticchicken/occ-stack",
4
+ "date": "2026-05-05",
5
+ "key_results": {
6
+ "code_simulated": {
7
+ "strategy": "OCC tiered escalation",
8
+ "pass_at_1": 0.78,
9
+ "iso_accuracy": true,
10
+ "compute_savings_vs_fixed": "52.3%",
11
+ "baseline_compute": 17500,
12
+ "occ_compute": 8350
13
+ },
14
+ "code_real_llm": {
15
+ "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
16
+ "status": "attempted_but_blocked",
17
+ "blocker": "code_extraction_heuristics_fail",
18
+ "accuracy": 0.0,
19
+ "note": "Model loads and generates on GPU. Need markdown stripping + AST validation."
20
+ },
21
+ "qa_simulated": {
22
+ "occ_accuracy": 0.71,
23
+ "rag_verifier_accuracy": 0.79,
24
+ "occ_retrievals": 227,
25
+ "rag_retrievals": 338,
26
+ "note": "OCC saves retrievals but lags on raw accuracy. Broker thresholds too conservative."
27
+ },
28
+ "debate_v2_adversarial": {
29
+ "occ_accuracy": 0.76,
30
+ "confidence_weighted_accuracy": 0.56,
31
+ "occ_bad_agent_containment": "100%",
32
+ "note": "Confidence-weighted voting collapses with adversarial agents. OCC filters them out."
33
+ }
34
+ },
35
+ "anti_gaming": {
36
+ "spam_attack": "100% credit exhaustion after ~10 actions",
37
+ "hidden_test_gaming": "100% oracle detection",
38
+ "over_abstention": "70% penalization",
39
+ "collusion": "Credit-based filtering excludes adversarial agents"
40
+ },
41
+ "limitations": [
42
+ "All main results are from simulated agents, not real LLMs",
43
+ "Real LLM code extraction needs improvement (markdown/AST)",
44
+ "Retrieval QA accuracy below RAG+verifier baseline",
45
+ "GRPO training hook implemented but not executed on real data"
46
+ ],
47
+ "next_steps": [
48
+ "Fix code extraction for real LLM inference",
49
+ "Domain-tune NLI for QA evidence scoring",
50
+ "Run GRPO training on Qwen 0.5B with DeepMath-103K",
51
+ "Publish as workshop paper (SafeGenAI/ALTA/ALOE)"
52
+ ]
53
+ }