occ-stack / eval_runner.py

Upload eval_runner.py

bc02d39 verified 27 days ago

9.95 kB

	"""
	Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
	Produces consolidated reports.
	"""

	import json
	import random
	from pathlib import Path
	from typing import Dict, List

	import numpy as np

	from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
	from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
	from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
	from oracle.oracle import ImpactOracle
	from ledger.ledger import CreditLedger
	from broker.broker import ResourceBroker


	class AblationRunner:
	    """Run ablation studies by disabling OCC components one at a time.

	    Every public suite (``ablation_code``, ``ablation_retrieval_qa``,
	    ``anti_gaming_tests``) resets the global ``random`` and NumPy generators
	    to ``self.seed`` before it starts, so the RNG state a suite begins from
	    does not depend on which suites ran earlier in the same process.
	    """

	    # (agent id, quality, cost per attempt) for the three simulated code
	    # agents; shared by the plain and the padding-prone ablation rosters.
	    _CODE_AGENT_SPECS = (
	        ("agent_A", 0.30, 80),
	        ("agent_B", 0.22, 60),
	        ("agent_C", 0.40, 120),
	    )

	    def __init__(self, seed: int = 42):
	        """Remember *seed* and seed the global RNGs with it.

	        Args:
	            seed: Value given to ``random.seed`` / ``np.random.seed`` and
	                forwarded as ``seed=`` to every benchmark this runner builds.
	        """
	        self.seed = seed
	        self._reseed()

	    # ------------------------------------------------------------------
	    # Internal helpers
	    # ------------------------------------------------------------------

	    def _reseed(self) -> None:
	        """Reset the global ``random`` and NumPy RNGs to ``self.seed``."""
	        random.seed(self.seed)
	        np.random.seed(self.seed)

	    def _new_code_bench(self) -> "CodeBenchmark":
	        """Build a 50-problem ``CodeBenchmark`` with ``self.seed`` and load its data."""
	        bench = CodeBenchmark(max_problems=50, seed=self.seed)
	        bench.load_data()
	        return bench

	    def _code_agents(self, **extra) -> List:
	        """Build the three standard code agents.

	        Args:
	            **extra: Additional keyword arguments passed unchanged to every
	                ``SimulatedCodeAgent`` (e.g. ``verbose_padding_prob=0.3``).
	        """
	        return [
	            SimulatedCodeAgent(agent_id, quality=quality, cost_per_attempt=cost, **extra)
	            for agent_id, quality, cost in self._CODE_AGENT_SPECS
	        ]

	    # ------------------------------------------------------------------
	    # Ablations for Code Benchmark
	    # ------------------------------------------------------------------

	    def ablation_code(self) -> Dict[str, Dict]:
	        """Run the code benchmark under the full and the ablated configurations.

	        Returns:
	            Mapping from configuration name (``full_occ``, ``no_ledger``,
	            ``no_cost_penalty``, ``no_anti_gaming``, ``no_broker``) to the
	            value returned by the corresponding benchmark run.
	        """
	        self._reseed()
	        bench = self._new_code_bench()
	        base_agents = self._code_agents()

	        results = {}

	        # 1. Full OCC
	        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)

	        # 2. No credit ledger (oracle score only)
	        # Simulate by running baseline_fixed but with oracle scoring.
	        # NOTE(review): this reuses the benchmark instance that just ran
	        # full_occ, unlike the other ablations which get a fresh one -
	        # confirm no state carries over between the two runs.
	        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)

	        # 3. No cost penalty (effectively baseline)
	        # Approximate by increasing compute budget so cost penalty vanishes.
	        bench_no_cost = self._new_code_bench()
	        bench_no_cost.oracle.compute_budget = 1e12
	        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5)

	        # 4. No anti-gaming penalty: zero the oracle's gaming weight and use
	        # agents that pad their output 30% of the time.
	        bench_no_game = self._new_code_bench()
	        bench_no_game.oracle.gaming_weight = 0.0
	        gaming_agents = self._code_agents(verbose_padding_prob=0.3)
	        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5)

	        # 5. No broker (oracle score only)
	        bench_no_broker = self._new_code_bench()
	        results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5)

	        return results

	    # ------------------------------------------------------------------
	    # Ablations for Retrieval QA
	    # ------------------------------------------------------------------

	    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
	        """Run the retrieval QA benchmark under baseline and ablated setups.

	        Returns:
	            Mapping from configuration name (``full_occ``, ``direct_answer``,
	            ``rag_baseline``, ``rag_verifier``, ``no_abstention``,
	            ``no_calibration``) to the value returned by the benchmark run.
	        """
	        self._reseed()
	        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
	        bench.generate_questions()

	        # Reference agent profile; each ablation overrides a single field.
	        base_profile = {
	            "accuracy": 0.65,
	            "hallucination_rate": 0.12,
	            "calibration_error": 0.15,
	            "abstention_rate": 0.1,
	        }
	        agent = SimulatedRetrievalAgent(agent_id="rag_agent", **base_profile)

	        results = {}
	        results["full_occ"] = bench.run_occ(agent)
	        results["direct_answer"] = bench.run_direct_answer(agent)
	        results["rag_baseline"] = bench.run_rag_baseline(agent)
	        results["rag_verifier"] = bench.run_rag_verifier(agent)

	        # Ablation: no abstention reward.
	        # Approximated by an agent that never abstains.
	        agent_no_abstain = SimulatedRetrievalAgent(
	            agent_id="rag_agent_no_abstain",
	            **{**base_profile, "abstention_rate": 0.0},
	        )
	        results["no_abstention"] = bench.run_occ(agent_no_abstain)

	        # Ablation: no calibration penalty.
	        # Approximated by an agent with zero calibration error.
	        agent_no_calib = SimulatedRetrievalAgent(
	            agent_id="rag_agent_no_calib",
	            **{**base_profile, "calibration_error": 0.0},
	        )
	        results["no_calibration"] = bench.run_occ(agent_no_calib)

	        return results

	    # ------------------------------------------------------------------
	    # Anti-Gaming Tests
	    # ------------------------------------------------------------------

	    def anti_gaming_tests(self) -> Dict[str, Dict]:
	        """Run adversarial scenarios against the credit system.

	        Returns:
	            Mapping from scenario name (``spam``, ``hoarding``,
	            ``hidden_test_gaming``, ``over_abstention``,
	            ``collusion_equal_turns``, ``collusion_occ``) to the value
	            returned by the benchmark run / summary for that scenario.
	        """
	        self._reseed()

	        results = {}

	        # 1. Spam low-value actions
	        bench = self._new_code_bench()
	        spam_agents = [
	            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
	            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
	        ]
	        results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)

	        # 2. Hoarding credits
	        # NOTE(review): the original code built CreditLedger(decay_lambda=0.0)
	        # here ("no decay = hoarding") but never attached it to anything, so
	        # this scenario runs with whatever ledger CodeBenchmark uses by
	        # default. The dead local was removed; wiring a no-decay ledger in
	        # needs a hook on CodeBenchmark that is not visible from this file.
	        bench_hoard = self._new_code_bench()
	        hoard_agents = [
	            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
	        ]
	        results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)

	        # 3. Hidden test gaming
	        bench_game = self._new_code_bench()
	        gaming_agents = [
	            SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
	        ]
	        results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5)

	        # 4. Over-abstention in retrieval
	        bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
	        bench_qa.generate_questions()
	        abstain_agent = SimulatedRetrievalAgent(
	            agent_id="abstainer",
	            accuracy=0.65,
	            hallucination_rate=0.12,
	            calibration_error=0.15,
	            abstention_rate=0.9,  # over-abstain
	        )
	        results["over_abstention"] = bench_qa.run_occ(abstain_agent)

	        # 5. Collusion in debate: two agents configured to collude with each
	        # other, two without a collusion partner, all at the same accuracy.
	        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
	        bench_debate.generate_topics()
	        colluding_agents = [
	            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
	            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
	            SimulatedDebateAgent("honest_1", accuracy=0.6),
	            SimulatedDebateAgent("honest_2", accuracy=0.6),
	        ]

	        # Run equal turns to simulate collusion effect
	        equal_turn_outcomes = [
	            bench_debate._resolve_equal_turns(colluding_agents, topic)
	            for topic in bench_debate.topics
	        ]
	        results["collusion_equal_turns"] = bench_debate._summarize(
	            equal_turn_outcomes, "collusion_equal_turns"
	        )

	        # OCC with colluders
	        occ_outcomes = [
	            bench_debate._resolve_occ_allocation(colluding_agents, topic)
	            for topic in bench_debate.topics
	        ]
	        results["collusion_occ"] = bench_debate._summarize(occ_outcomes, "collusion_occ")

	        return results

	    # ------------------------------------------------------------------
	    # Consolidated run
	    # ------------------------------------------------------------------

	    def run_all(self, output_dir: str = "/app/occ/reports") -> Dict:
	        """Run every suite and write the combined report as JSON.

	        Args:
	            output_dir: Directory that receives
	                ``ablation_and_anti_gaming.json``; created (with parents) if
	                missing. Defaults to the previously hard-coded location.

	        Returns:
	            Dict with keys ``code_ablations``, ``qa_ablations`` and
	            ``anti_gaming`` holding the three suites' results.
	        """
	        print("Running code ablations...")
	        code_ablations = self.ablation_code()

	        print("Running retrieval QA ablations...")
	        qa_ablations = self.ablation_retrieval_qa()

	        print("Running anti-gaming tests...")
	        anti_gaming = self.anti_gaming_tests()

	        report = {
	            "code_ablations": code_ablations,
	            "qa_ablations": qa_ablations,
	            "anti_gaming": anti_gaming,
	        }

	        report_dir = Path(output_dir)
	        report_dir.mkdir(parents=True, exist_ok=True)
	        report_path = report_dir / "ablation_and_anti_gaming.json"
	        with open(report_path, "w", encoding="utf-8") as f:
	            # default=str stringifies any value json cannot encode natively.
	            json.dump(report, f, indent=2, default=str)
	        # Report the real destination rather than a fixed relative path.
	        print(f"\nSaved ablation/anti-gaming results to {report_path}")
	        return report


	def _format_metric(value, spec: str) -> str:
	    """Format *value* with *spec*, falling back to text when that is impossible.

	    A missing metric (``None``) renders as ``"N/A"``; any other value that
	    rejects the numeric format spec is rendered with ``str``.
	    """
	    try:
	        return format(value, spec)
	    except (TypeError, ValueError):
	        return "N/A" if value is None else str(value)


	def main():
	    """Run all suites with seed 42 and print a per-configuration summary.

	    Metrics absent from a result are printed as ``N/A`` instead of raising:
	    the previous ``f"{v.get(key, 'N/A'):.3f}"`` applied a float format spec
	    to the string fallback, which is a ``ValueError``.
	    """
	    runner = AblationRunner(seed=42)
	    report = runner.run_all()

	    print("\n" + "=" * 60)
	    print("ABLATION SUMMARY")
	    print("=" * 60)

	    print("\n--- Code Ablations ---")
	    for name, metrics in report["code_ablations"].items():
	        pass_at_1 = _format_metric(metrics.get("pass@1"), ".3f")
	        compute = _format_metric(metrics.get("total_compute"), ".0f")
	        print(f"{name:20s}: pass@1={pass_at_1}, compute={compute}")

	    print("\n--- QA Ablations ---")
	    for name, metrics in report["qa_ablations"].items():
	        accuracy = _format_metric(metrics.get("accuracy"), ".3f")
	        ece = _format_metric(metrics.get("ece"), ".3f")
	        compute = _format_metric(metrics.get("total_compute"), ".0f")
	        print(f"{name:20s}: acc={accuracy}, ECE={ece}, compute={compute}")

	    print("\n--- Anti-Gaming ---")
	    for name, metrics in report["anti_gaming"].items():
	        compute = _format_metric(metrics.get("total_compute"), ".0f")
	        # Results carrying "accuracy" take precedence over "pass@1";
	        # results with neither key are not printed.
	        if "accuracy" in metrics:
	            accuracy = _format_metric(metrics["accuracy"], ".3f")
	            print(f"{name:20s}: acc={accuracy}, compute={compute}")
	        elif "pass@1" in metrics:
	            pass_at_1 = _format_metric(metrics["pass@1"], ".3f")
	            print(f"{name:20s}: pass@1={pass_at_1}, compute={compute}")

	# Script entry point: run the full evaluation only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    main()