# Source: occ-stack / eval_runner.py
# Uploaded by narcolepticchicken ("Upload eval_runner.py", commit bc02d39, verified), 9.95 kB
"""
Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
Produces consolidated reports.
"""
import json
import random
from pathlib import Path
from typing import Dict, List
import numpy as np
from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
from oracle.oracle import ImpactOracle
from ledger.ledger import CreditLedger
from broker.broker import ResourceBroker
class AblationRunner:
    """Run ablation studies by disabling OCC components one at a time.

    Every study method reseeds the global ``random`` and ``numpy`` generators
    from ``self.seed`` before doing any work, so each one gives the same
    results whether it is called on its own or as part of ``run_all``.
    """

    def __init__(self, seed: int = 42):
        """Store ``seed`` and seed the global ``random`` / ``numpy`` RNGs."""
        self.seed = seed
        self._reseed()

    def _reseed(self) -> None:
        """Reset the global ``random`` and ``numpy`` RNGs to ``self.seed``."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _fresh_code_benchmark(self):
        """Build a 50-problem ``CodeBenchmark`` with this seed and load its data."""
        bench = CodeBenchmark(max_problems=50, seed=self.seed)
        bench.load_data()
        return bench

    # ------------------------------------------------------------------
    # Ablations for Code Benchmark
    # ------------------------------------------------------------------
    def ablation_code(self) -> Dict[str, Dict]:
        """Run the code benchmark under full OCC and four ablated setups.

        Returns:
            Mapping from configuration name (``full_occ``, ``no_ledger``,
            ``no_cost_penalty``, ``no_anti_gaming``, ``no_broker``) to the
            result object returned by the corresponding benchmark run.
        """
        self._reseed()
        bench = self._fresh_code_benchmark()
        base_agents = [
            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
        ]
        results = {}

        # 1. Full OCC
        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)

        # 2. No credit ledger: approximated with the fixed-attempt baseline,
        #    run on the same benchmark instance as the full OCC run.
        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)

        # 3. No cost penalty: approximated by making the oracle's compute
        #    budget so large that the cost term should become negligible.
        bench_no_cost = self._fresh_code_benchmark()
        bench_no_cost.oracle.compute_budget = 1e12
        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(
            base_agents, max_attempts=5
        )

        # 4. No anti-gaming penalty: zero the oracle's gaming weight and use
        #    agents that pad their output 30% of the time.
        bench_no_game = self._fresh_code_benchmark()
        bench_no_game.oracle.gaming_weight = 0.0
        gaming_agents = [
            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3),
            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3),
            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3),
        ]
        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(
            gaming_agents, max_attempts=5
        )

        # 5. No broker: fixed-attempt baseline with the same attempt cap (5)
        #    that the OCC runs use.
        bench_no_broker = self._fresh_code_benchmark()
        results["no_broker"] = bench_no_broker.run_baseline_fixed(
            base_agents, fixed_attempts=5
        )
        return results

    # ------------------------------------------------------------------
    # Ablations for Retrieval QA
    # ------------------------------------------------------------------
    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
        """Run the retrieval-QA benchmark under baseline and ablated setups.

        Returns:
            Mapping from configuration name (``full_occ``, ``direct_answer``,
            ``rag_baseline``, ``rag_verifier``, ``no_abstention``,
            ``no_calibration``) to the result object of that run.
        """
        self._reseed()
        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
        bench.generate_questions()
        agent = SimulatedRetrievalAgent(
            agent_id="rag_agent",
            accuracy=0.65,
            hallucination_rate=0.12,
            calibration_error=0.15,
            abstention_rate=0.1,
        )
        results = {}
        results["full_occ"] = bench.run_occ(agent)
        results["direct_answer"] = bench.run_direct_answer(agent)
        results["rag_baseline"] = bench.run_rag_baseline(agent)
        results["rag_verifier"] = bench.run_rag_verifier(agent)

        # Ablation: no abstention reward, approximated by an agent that
        # never abstains (abstention_rate=0.0).
        agent_no_abstain = SimulatedRetrievalAgent(
            agent_id="rag_agent_no_abstain",
            accuracy=0.65,
            hallucination_rate=0.12,
            calibration_error=0.15,
            abstention_rate=0.0,
        )
        results["no_abstention"] = bench.run_occ(agent_no_abstain)

        # Ablation: no calibration penalty, approximated by an agent with
        # zero calibration error.
        agent_no_calib = SimulatedRetrievalAgent(
            agent_id="rag_agent_no_calib",
            accuracy=0.65,
            hallucination_rate=0.12,
            calibration_error=0.0,
            abstention_rate=0.1,
        )
        results["no_calibration"] = bench.run_occ(agent_no_calib)
        return results

    # ------------------------------------------------------------------
    # Anti-Gaming Tests
    # ------------------------------------------------------------------
    def anti_gaming_tests(self) -> Dict[str, Dict]:
        """Run adversarial scenarios against the credit system.

        Returns:
            Mapping from scenario name (``spam``, ``hoarding``,
            ``hidden_test_gaming``, ``over_abstention``,
            ``collusion_equal_turns``, ``collusion_occ``) to the result
            object of that run.
        """
        self._reseed()
        results = {}

        # 1. Spam low-value actions: two very low-quality agents, many attempts.
        bench = self._fresh_code_benchmark()
        spam_agents = [
            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
        ]
        results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)

        # 2. Hoarding credits.
        # TODO(review): this scenario is meant to run against a ledger with no
        # decay (decay_lambda=0.0), but nothing visible here hands such a
        # ledger to the benchmark. Wire one in once the benchmark's ledger
        # hook is confirmed; until then this is a plain OCC run with a single
        # strong agent.
        bench_hoard = self._fresh_code_benchmark()
        hoard_agents = [
            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
        ]
        results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)

        # 3. Hidden test gaming: a single agent with gaming_mode enabled.
        bench_game = self._fresh_code_benchmark()
        gaming_agents = [
            SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
        ]
        results["hidden_test_gaming"] = bench_game.run_occ_allocation(
            gaming_agents, max_attempts=5
        )

        # 4. Over-abstention in retrieval: agent abstains 90% of the time.
        bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
        bench_qa.generate_questions()
        abstain_agent = SimulatedRetrievalAgent(
            agent_id="abstainer",
            accuracy=0.65,
            hallucination_rate=0.12,
            calibration_error=0.15,
            abstention_rate=0.9,  # over-abstain
        )
        results["over_abstention"] = bench_qa.run_occ(abstain_agent)

        # 5. Collusion in debate: two agents configured to collude with each
        #    other plus two honest agents, all at the same accuracy.
        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
        bench_debate.generate_topics()
        colluding_agents = [
            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
            SimulatedDebateAgent("honest_1", accuracy=0.6),
            SimulatedDebateAgent("honest_2", accuracy=0.6),
        ]

        # Equal-turn resolution first, over every topic, then OCC allocation
        # over the same topics. The two passes are kept in this order so the
        # shared RNG is consumed in the same sequence on every run.
        equal_turn_outcomes = [
            bench_debate._resolve_equal_turns(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_equal_turns"] = bench_debate._summarize(
            equal_turn_outcomes, "collusion_equal_turns"
        )

        occ_outcomes = [
            bench_debate._resolve_occ_allocation(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_occ"] = bench_debate._summarize(occ_outcomes, "collusion_occ")
        return results

    # ------------------------------------------------------------------
    # Consolidated run
    # ------------------------------------------------------------------
    def run_all(self, output_dir: str = "/app/occ/reports") -> Dict:
        """Run every study and write one consolidated JSON report.

        Args:
            output_dir: Directory that receives
                ``ablation_and_anti_gaming.json``. It is created (with
                parents) if missing. Defaults to ``/app/occ/reports``.

        Returns:
            Dict with keys ``code_ablations``, ``qa_ablations`` and
            ``anti_gaming`` holding the three studies' results.
        """
        print("Running code ablations...")
        code_ablations = self.ablation_code()
        print("Running retrieval QA ablations...")
        qa_ablations = self.ablation_retrieval_qa()
        print("Running anti-gaming tests...")
        anti_gaming = self.anti_gaming_tests()
        report = {
            "code_ablations": code_ablations,
            "qa_ablations": qa_ablations,
            "anti_gaming": anti_gaming,
        }

        report_dir = Path(output_dir)
        report_dir.mkdir(parents=True, exist_ok=True)
        report_path = report_dir / "ablation_and_anti_gaming.json"
        with open(report_path, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot encode natively.
            json.dump(report, f, indent=2, default=str)
        # Report the path actually written rather than a fixed relative one.
        print(f"\nSaved ablation/anti-gaming results to {report_path}")
        return report
def _format_metric(metrics: Dict, key: str, spec: str) -> str:
    """Return ``metrics[key]`` formatted with ``spec``, or plain text on failure.

    A missing key yields ``"N/A"``. A value that does not support ``spec``
    (for example ``None`` or a string) is returned via ``str()`` instead of
    raising, so one absent metric cannot abort the whole summary.
    """
    value = metrics.get(key, "N/A")
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


def main():
    """Run all studies with seed 42 and print a per-configuration summary."""
    runner = AblationRunner(seed=42)
    report = runner.run_all()

    print("\n" + "=" * 60)
    print("ABLATION SUMMARY")
    print("=" * 60)

    print("\n--- Code Ablations ---")
    for k, v in report["code_ablations"].items():
        pass_at_1 = _format_metric(v, "pass@1", ".3f")
        compute = _format_metric(v, "total_compute", ".0f")
        print(f"{k:20s}: pass@1={pass_at_1}, compute={compute}")

    print("\n--- QA Ablations ---")
    for k, v in report["qa_ablations"].items():
        acc = _format_metric(v, "accuracy", ".3f")
        ece = _format_metric(v, "ece", ".3f")
        compute = _format_metric(v, "total_compute", ".0f")
        print(f"{k:20s}: acc={acc}, ECE={ece}, compute={compute}")

    print("\n--- Anti-Gaming ---")
    for k, v in report["anti_gaming"].items():
        # Scenarios come from different benchmarks, so pick whichever headline
        # metric is present; entries exposing neither are not printed.
        compute = _format_metric(v, "total_compute", ".0f")
        if "accuracy" in v:
            acc = _format_metric(v, "accuracy", ".3f")
            print(f"{k:20s}: acc={acc}, compute={compute}")
        elif "pass@1" in v:
            pass_at_1 = _format_metric(v, "pass@1", ".3f")
            print(f"{k:20s}: pass@1={pass_at_1}, compute={compute}")


if __name__ == "__main__":
    main()