# File size: 12,982 Bytes

"""
Unified evaluation runner: all ablations + anti-gaming tests.
Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests
(hidden-test gaming, collusion, over-abstention, and compute spam).
"""
import json
import random
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np

# Ensure imports work
sys.path.insert(0, str(Path(__file__).parent))
from oracle.oracle import ImpactOracle
from ledger.ledger import CreditLedger
from broker.broker import ResourceBroker, Decision
from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
from benchmarks.benchmark_retrieval_qa import (
    QABenchmark,
    SimulatedAgent,
    create_qa_dataset,
)
from benchmarks.benchmark_debate_v2 import (
    DebateBenchmark,
    FactualAgent,
    OverconfidentAgent,
    UncertainAgent,
    SycophantAgent,
)


@dataclass
class AblConfig:
    """Settings for one ablation condition of the evaluation runner.

    Instances are built positionally in ``ABLATIONS``, so the field order
    below is part of the interface.
    """

    # Short identifier; used as the key under "ablations" in the results.
    name: str
    # Human-readable summary printed when the ablation runs.
    description: str
    # NOTE(review): not read by any runner in this file - confirm intended use.
    oracle_weights: Dict[str, Any]
    # Passed as ``thresholds`` to ResourceBroker.
    broker_thresholds: Dict[str, float]
    # Passed as ``decay_lambda`` to CreditLedger.
    decay_lambda: float
    # Passed as ``gaming_penalty`` to ImpactOracle when ``anti_gaming_on`` is true.
    gaming_penalty: float
    # Passed as ``compute_penalty_rate`` to ImpactOracle.
    compute_penalty_rate: float
    # When false, the runners pass a gaming penalty of 0.0 to the oracle.
    anti_gaming_on: bool


# Ablation grid: the baseline configuration followed by nine variants that
# each change exactly one field relative to it.
ABLATIONS = [
    AblConfig(
        name="default",
        description="Full OCC stack",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="no_decay",
        description="No credit decay (lambda=0)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.0,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="fast_decay",
        description="Aggressive decay (lambda=0.1)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.1,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="no_gaming_penalty",
        description="No gaming penalties",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=0.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="high_gaming_penalty",
        description="Severe gaming penalties (5.0)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=5.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="lenient_broker",
        description="Lenient broker (thresholds x0.5)",
        oracle_weights={},
        broker_thresholds={"low": 0.25, "medium": 1.0, "high": 2.5},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="strict_broker",
        description="Strict broker (thresholds x2.0)",
        oracle_weights={},
        broker_thresholds={"low": 1.0, "medium": 4.0, "high": 10.0},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="high_compute_cost",
        description="High compute penalty (x10)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="low_compute_cost",
        description="Low compute penalty (x0.1)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.00001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="anti_gaming_off",
        description="Disable all anti-gaming detectors",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=False,
    ),
]


def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
    """Run the simulated code benchmark under one ablation configuration.

    An oracle, ledger and broker are built from ``config`` and handed,
    together with three simulated agents, to ``_run_occ_code_standalone``,
    which seeds the ledger, builds the benchmark and runs the allocation loop.

    Args:
        config: Ablation settings. Supplies the oracle's compute-penalty rate
            and gaming penalty (forced to 0.0 when ``anti_gaming_on`` is
            false), the ledger's decay lambda and the broker thresholds.
        seed: Seed applied to ``random`` and ``numpy.random`` and forwarded
            to the standalone runner.
        n_problems: Number of benchmark problems, forwarded to the standalone
            runner.

    Returns:
        The metrics dict returned by ``_run_occ_code_standalone``.
    """
    random.seed(seed)
    np.random.seed(seed)

    oracle = ImpactOracle(
        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
    medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
    expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)

    # The standalone runner seeds the ledger and constructs its own
    # CodeBenchmark. Seeding here as well would credit every agent twice, and
    # a benchmark run whose result is thrown away only burns compute and may
    # disturb agent state before the measured run.
    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)


def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed):
    """Run a cost-ranked agent cascade over a fresh ``CodeBenchmark``.

    Each agent first receives seed credit proportional to its mean pass rate.
    For every problem, agents are tried in ascending order of cost per unit
    of mean pass rate until one passes the public (or hidden) tests. Every
    attempt is scored by ``oracle`` and recorded in ``ledger`` as an earn
    (positive raw score) or a spend of 1.0.

    Args:
        oracle: Scorer; ``score(...)`` must return an object with ``raw_score``.
        ledger: Credit store providing ``earn`` and ``spend``.
        broker: Not used by this function.
        cheap, medium, expensive: Agents exposing ``agent_id``,
            ``pass_rate_easy``, ``pass_rate_hard``, ``cost_per_attempt``,
            ``attempts`` and ``solve(problem)``.
        n_problems: Number of problems passed to ``CodeBenchmark``.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.

    Returns:
        Dict with ``accuracy`` (fraction of problems whose final attempt
        passed the public tests), ``total_compute``, ``mean_compute`` and
        ``mean_agents``. The ratios are 0.0 when the benchmark has no problems.
    """
    random.seed(seed)
    np.random.seed(seed)
    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    agents = [cheap, medium, expensive]

    def mean_pass_rate(candidate):
        return (candidate.pass_rate_easy + candidate.pass_rate_hard) / 2

    for agent in agents:
        ledger.earn(agent.agent_id, "seed", "seed", mean_pass_rate(agent) * 20, 0.0, 0.0, "initial", "model_call")

    total_compute = 0
    results = []
    for problem in bench.problems:
        solved = False
        cost = 0
        used = []
        # Cheapest-per-expected-success first; the 0.1 floor avoids dividing
        # by a near-zero pass rate.
        ranked = sorted(agents, key=lambda c: c.cost_per_attempt / max(0.1, mean_pass_rate(c)))
        for agent in ranked:
            if solved or len(used) >= 3:
                break
            # Charge the attempt to the agent that is actually solving. The
            # previous code incremented a stale variable left over from the
            # seeding loop, i.e. always the last agent in ``agents``.
            agent.attempts += 1
            r = agent.solve(problem)
            cost += r["compute_cost"]
            total_compute += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            # ``cost`` is cumulative for this problem, so later attempts are
            # scored against everything spent so far.
            oracle_res = oracle.score(
                "code", {"attempt": len(used)}, {},
                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                agent_id=agent.agent_id,
            )
            if oracle_res.raw_score > 0:
                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
                            oracle_res.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    n_results = len(results)
    if n_results == 0:
        # Empty benchmark: report zeros instead of raising ZeroDivisionError.
        return {
            "accuracy": 0.0,
            "total_compute": total_compute,
            "mean_compute": 0.0,
            "mean_agents": 0.0,
        }

    return {
        "accuracy": sum(1 for r in results if r["solved"]) / n_results,
        "total_compute": total_compute,
        "mean_compute": total_compute / n_results,
        "mean_agents": sum(len(r["agents"]) for r in results) / n_results,
    }


def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
    """Run the retrieval-QA benchmark under one ablation configuration.

    Args:
        config: Ablation settings used to build the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random``, the dataset and the
            benchmark.

    Returns:
        Dict with ``accuracy``, ``total_compute`` and ``mean_compute`` copied
        from the benchmark result, plus ``precision`` and ``recall`` (0 when
        the benchmark does not report them).
    """
    random.seed(seed)
    np.random.seed(seed)

    # Turning anti-gaming off is expressed as a zero gaming penalty.
    if config.anti_gaming_on:
        penalty = config.gaming_penalty
    else:
        penalty = 0.0

    scorer = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=penalty,
    )
    credits = CreditLedger(decay_lambda=config.decay_lambda)
    allocator = ResourceBroker(thresholds=config.broker_thresholds)

    dataset = create_qa_dataset(seed=seed)
    benchmark = QABenchmark(dataset, scorer, credits, allocator, seed=seed)

    qa_agent = SimulatedAgent("qa_agent", scorer, credits, allocator, 0.85)
    qa_agent.budget = 50000
    qa_agent.strategy = "adaptive"

    outcome = benchmark.run_occ(qa_agent)

    summary = {key: outcome[key] for key in ("accuracy", "total_compute", "mean_compute")}
    summary["precision"] = outcome.get("precision", 0)
    summary["recall"] = outcome.get("recall", 0)
    return summary


def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
    """Run the debate benchmark under one ablation configuration.

    Args:
        config: Ablation settings used to build the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_debates: NOTE(review): accepted but never read by this function -
            confirm whether it should be forwarded to the benchmark.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute`` copied from the benchmark result.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Turning anti-gaming off is expressed as a zero gaming penalty.
    if config.anti_gaming_on:
        penalty = config.gaming_penalty
    else:
        penalty = 0.0

    scorer = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=penalty,
    )
    credits = CreditLedger(decay_lambda=config.decay_lambda)
    allocator = ResourceBroker(thresholds=config.broker_thresholds)

    benchmark = DebateBenchmark(
        oracle=scorer,
        ledger=credits,
        broker=allocator,
        factual_accuracy=0.9,
        adversarial_ratio=0.0,
        n_agents=3,
        seed=seed,
    )

    # Three factual debaters, ids f0..f2.
    debaters = []
    for i in range(3):
        debaters.append(FactualAgent(f"f{i}", 0.9))

    outcome = benchmark.run_debate(debaters)

    reported = ("accuracy", "consensus_reached", "total_compute", "mean_compute")
    return {key: outcome[key] for key in reported}


def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
    """Compare oracle scores for a normal agent and a hidden-test gaming agent.

    Both agents attempt every problem of a fresh ``CodeBenchmark``; each
    attempt is scored by an ``ImpactOracle`` built with ``gaming_penalty=2.0``.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_problems: Number of benchmark problems.

    Returns:
        ``{"normal": stats, "gamer": stats}`` where each ``stats`` dict holds
        ``public_acc``, ``hidden_acc``, ``mean_raw`` and ``penalized_rate``
        (share of attempts tagged ``"gaming_hidden_tests"``). All four are
        0.0 when the benchmark has no problems.
    """
    random.seed(seed)
    np.random.seed(seed)
    oracle = ImpactOracle(gaming_penalty=2.0)
    # NOTE(review): the ledger is seeded and the broker constructed, but
    # neither is consulted by the scoring loop below.
    ledger = CreditLedger()
    broker = ResourceBroker()

    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    # Normal agent
    normal = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150)
    # Gaming agent: high public pass, high hidden fail
    gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100)

    ledger.earn("normal", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")
    ledger.earn("gamer", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")

    results = {"normal": [], "gamer": []}
    for problem in bench.problems:
        for agent in [normal, gamer]:
            r = agent.solve(problem)
            oracle_res = oracle.score(
                "code", {}, {},
                {"correctness": 1.0 if r["public_pass"] else 0.0,
                 "pass_at_k": 1.0 if r["hidden_pass"] else 0.0,
                 "compute_cost": r["compute_cost"],
                 "public_pass": r["public_pass"],
                 "hidden_tests_pass": r["hidden_pass"]},
                agent_id=agent.agent_id,
            )
            results[agent.agent_id].append({
                "public_pass": r["public_pass"],
                "hidden_pass": r["hidden_pass"],
                "raw_score": oracle_res.raw_score,
                "penalized": "gaming_hidden_tests" in oracle_res.failure_tags,
            })

    def summarize(who):
        rs = results[who]
        n = len(rs)
        if n == 0:
            # No problems were attempted; avoid dividing by zero.
            return {"public_acc": 0.0, "hidden_acc": 0.0, "mean_raw": 0.0, "penalized_rate": 0.0}
        return {
            "public_acc": sum(1 for r in rs if r["public_pass"]) / n,
            "hidden_acc": sum(1 for r in rs if r["hidden_pass"]) / n,
            "mean_raw": sum(r["raw_score"] for r in rs) / n,
            "penalized_rate": sum(1 for r in rs if r["penalized"]) / n,
        }

    return {"normal": summarize("normal"), "gamer": summarize("gamer")}


def run_anti_gaming_collusion(seed: int = 42) -> Dict:
    """Attempt a credit transfer between two seeded accounts and report on it.

    Args:
        seed: Unused; kept so all anti-gaming tests share a signature.

    Returns:
        Dict with the transfer's return value (``transfer_allowed``) and its
        negation (``transfer_blocked``), both balances after the attempt, and
        whether ``detect_collusion`` returned anything truthy.
    """
    ledger = CreditLedger()
    for holder, amount in (("alice", 10), ("bob", 1)):
        ledger.earn(holder, "seed", "seed", amount, 0.0, 0.0, "initial")

    transfer_ok = ledger.transfer("alice", "bob", 5.0, "global")

    # Balances are read after the attempt, alice first.
    balances = {who: ledger.balance(who) for who in ("alice", "bob")}
    flagged = ledger.detect_collusion(window=10)

    report = {}
    report["transfer_allowed"] = transfer_ok
    report["alice_balance"] = balances["alice"]
    report["bob_balance"] = balances["bob"]
    report["collusion_detected"] = bool(flagged)
    report["transfer_blocked"] = not transfer_ok
    return report


def run_anti_gaming_abstention(seed: int = 42) -> Dict:
    """Score ten identical abstentions on an answerable question.

    Args:
        seed: Unused; kept so all anti-gaming tests share a signature.

    Returns:
        Dict with ``mean_reward`` over the ten scores and ``expected_negative``,
        which is true when the rewards sum to less than zero.
    """
    oracle = ImpactOracle()
    # The agent abstains every time even though the question is answerable.
    rewards = [
        oracle.score(
            "retrieval_qa",
            {"abstained": True},
            {"is_unanswerable": False, "gold_answer": "yes"},
            {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
        ).reward_value
        for _ in range(10)
    ]
    total = sum(rewards)
    return {"mean_reward": total / len(rewards), "expected_negative": total < 0}


def run_anti_gaming_spam(seed: int = 42) -> Dict:
    """Score one expensive, wrong, low-confidence QA answer.

    Args:
        seed: Unused; kept so all anti-gaming tests share a signature.

    Returns:
        Dict with the oracle's ``reward``, the ``tags`` it attached and
        ``tagged`` (whether any tag was attached).
    """
    oracle = ImpactOracle()
    reference = {"gold_answer": "paris"}
    # Wrong answer produced at a high compute cost.
    submission = {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000}
    verdict = oracle.score("retrieval_qa", {}, reference, submission)
    return {
        "reward": verdict.reward_value,
        "tagged": bool(verdict.failure_tags),
        "tags": verdict.failure_tags,
    }


def run_all(output_dir: Path = Path("/app/occ/reports")) -> Dict:
    """Run every ablation and anti-gaming test, print progress, save a report.

    Args:
        output_dir: Directory that receives ``eval_runner_results.json``; a
            ``str`` is accepted as well. It is created if missing. Defaults
            to the previously hard-coded ``/app/occ/reports``.

    Returns:
        Dict with two keys: ``"ablations"`` maps each ablation name to its
        config and its code / QA / debate metrics, and ``"anti_gaming"`` maps
        each anti-gaming test name to that test's result dict.
    """
    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER")
    print("=" * 60)

    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for abl in ABLATIONS:
        print(f"\n--- ABLATION: {abl.name} ---")
        print(f"  {abl.description}")
        code_res = run_ablation_code(abl, seed=42, n_problems=50)
        qa_res = run_ablation_qa(abl, seed=42)
        debate_res = run_ablation_debate(abl, seed=42)
        print(f"  Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
        print(f"  QA:   acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
        print(f"  Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
        all_results["ablations"][abl.name] = {
            # Shallow copy so the report does not alias the config object's
            # own attribute dict.
            "config": dict(vars(abl)),
            "code": code_res,
            "qa": qa_res,
            "debate": debate_res,
        }

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")
    anti_gaming = all_results["anti_gaming"]
    anti_gaming["hidden_test_gaming"] = run_anti_gaming_code(seed=42)
    anti_gaming["collusion"] = run_anti_gaming_collusion(seed=42)
    anti_gaming["abstention"] = run_anti_gaming_abstention(seed=42)
    anti_gaming["spam"] = run_anti_gaming_spam(seed=42)

    for test_name, res in anti_gaming.items():
        print(f"\n  {test_name}: {json.dumps(res, indent=2, default=str)}")

    # Save; default=str stringifies anything json cannot encode natively.
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    report_path = out / "eval_runner_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nSaved to {report_path}")
    return all_results


# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    run_all()