narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 27 days ago

Commit

b40184a

verified ·

1 Parent(s): 427ee84

Upload benchmarks/benchmark_code.py

Browse files

Files changed (1) hide show

benchmarks/benchmark_code.py +408 -0

benchmarks/benchmark_code.py ADDED Viewed

	@@ -0,0 +1,408 @@

+"""
+Benchmark 1: Code Compute Allocation
+Compares:
+A. baseline fixed compute
+B. verifier-guided retries
+C. OCC credit/resource allocation
+D. OCC + oracle-aware allocation
+Uses HumanEval / EvalPlus-style evaluation with simulated agents.
+"""
+import json
+import random
+import time
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from datasets import load_dataset
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from oracle.oracle import ImpactOracle, OracleResult
+from ledger.ledger import CreditLedger, LedgerEntry
+from broker.broker import ResourceBroker, Decision
+from rl.reward import RewardHook, OfflineComparator
class SimulatedCodeAgent:
    """
    Simulates a code generation agent with variable quality.

    No code is actually generated or executed: each call to ``generate``
    draws from the global ``random`` module to decide whether the attempt
    "passes". The ``quality`` parameter is the probability that an attempt
    passes the public tests.

    Attributes (all set in ``__init__``):
        agent_id: Identifier reported to the oracle / ledger by callers.
        quality: Probability that a single attempt passes the public tests.
        cost_per_attempt: Token cost charged for one (non-padded) attempt.
        verbose_padding_prob: Probability that an attempt costs 3x tokens.
        gaming_mode: If true, a public pass only survives the hidden tests
            half of the time.
        attempts_made: Running count of ``generate`` calls.
        tokens_used: Running sum of tokens charged across all attempts.
    """

    def __init__(
        self,
        agent_id: str,
        quality: float = 0.3,
        cost_per_attempt: float = 100.0,
        verbose_padding_prob: float = 0.0,
        gaming_mode: bool = False,
    ):
        self.agent_id = agent_id
        self.quality = quality
        self.cost_per_attempt = cost_per_attempt
        self.verbose_padding_prob = verbose_padding_prob
        self.gaming_mode = gaming_mode
        self.attempts_made = 0
        self.tokens_used = 0

    def generate(self, problem: Dict, oracle: "ImpactOracle", context: Dict) -> Dict:
        """Generate a code attempt. Returns result dict for oracle scoring.

        ``problem``, ``oracle`` and ``context`` are accepted for interface
        compatibility but are not read by the simulation.

        Returns:
            A dict with keys ``passed`` (public tests), ``hidden_passed``
            (hidden tests), ``compute_cost`` / ``tokens_used`` (tokens charged
            for this attempt), ``k`` (always 1) and ``passes`` (1 if the
            hidden tests passed, else 0).
        """
        self.attempts_made += 1

        tokens = self.cost_per_attempt
        if random.random() < self.verbose_padding_prob:
            tokens *= 3.0  # verbose but low-value
        self.tokens_used += tokens

        # Simulate correctness on the public tests.
        passed = random.random() < self.quality
        hidden_passed = passed
        if self.gaming_mode:
            # Gaming: passes public tests but fails hidden tests sometimes.
            # A hidden pass must imply a public pass, so the extra draw can
            # only demote a public pass; the overall hidden pass rate stays
            # quality * 0.5. The draw is made unconditionally so the number
            # of RNG calls per attempt does not depend on `passed`.
            survives_hidden = random.random() < 0.5
            hidden_passed = passed and survives_hidden

        return {
            "passed": passed,
            "hidden_passed": hidden_passed,
            "compute_cost": tokens,
            "k": 1,
            "passes": 1 if hidden_passed else 0,
            "tokens_used": tokens,
        }
class CodeBenchmark:
    """
    Run code compute allocation benchmark with multiple strategies.

    Three strategies are implemented: ``run_baseline_fixed``,
    ``run_verifier_retries`` and ``run_occ_allocation``. Each one re-seeds
    the ``random`` and ``numpy`` RNGs with ``self.seed`` before it starts and
    returns the summary dict produced by ``_summarize``.

    All strategies score attempts through the single ``self.oracle``
    instance. ``self.ledger`` and ``self.broker`` are created in
    ``__init__`` but are not read by any method of this class;
    ``run_occ_allocation`` builds its own fresh ledger and broker.
    """

    def __init__(
        self,
        dataset_name: str = "openai/openai_humaneval",
        split: str = "test",
        max_problems: int = 50,
        seed: int = 42,
    ):
        self.dataset_name = dataset_name
        self.split = split
        self.max_problems = max_problems
        self.seed = seed
        self.problems: List[Dict] = []
        self.oracle = ImpactOracle(compute_budget=1e5)
        self.ledger = CreditLedger(decay_lambda=0.05)
        self.broker = ResourceBroker()

    def load_data(self) -> None:
        """Load the first ``max_problems`` rows of the dataset into ``self.problems``.

        Each problem keeps ``task_id``, ``prompt``, ``canonical_solution``,
        ``entry_point`` and ``test``; the two optional fields default to "".
        """
        ds = load_dataset(self.dataset_name, split=self.split)
        self.problems = [
            {
                "task_id": row["task_id"],
                "prompt": row["prompt"],
                "canonical_solution": row.get("canonical_solution", ""),
                "entry_point": row["entry_point"],
                "test": row.get("test", ""),
            }
            for row in ds.select(range(min(self.max_problems, len(ds))))
        ]

    def _reseed(self) -> None:
        """Reset both RNGs so every strategy starts from the same seed."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def run_baseline_fixed(
        self,
        agents: "List[SimulatedCodeAgent]",
        fixed_attempts: int = 3,
    ) -> Dict:
        """Baseline: each agent gets fixed number of attempts per problem.

        Every agent always spends all ``fixed_attempts`` attempts on every
        problem (no early stop). A problem counts as passed if any attempt
        by any agent has ``hidden_passed`` set.
        """
        self._reseed()
        results = []
        total_compute = 0.0
        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0
            for agent in agents:
                for _ in range(fixed_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    oracle_res = self.oracle.score(
                        mode="code",
                        action={},
                        context={"previous_passed": best_hidden},
                        result=result,
                        agent_id=agent.agent_id,
                    )
                    best_score = max(best_score, oracle_res.raw_score)
                    best_hidden = best_hidden or result["hidden_passed"]
                    attempts += 1
                    total_compute += result["compute_cost"]
            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
            })
        return self._summarize(results, total_compute, "baseline_fixed")

    def run_verifier_retries(
        self,
        agents: "List[SimulatedCodeAgent]",
        max_attempts: int = 5,
        verifier_budget: int = 2,
    ) -> Dict:
        """Verifier-guided: retry only if verifier (public test) says fail.

        Each agent retries up to ``max_attempts`` times and stops at its
        first attempt whose public tests pass; only that attempt is scored
        by the oracle. Later agents still run even if an earlier agent
        already solved the problem.

        NOTE: ``verifier_budget`` is accepted for interface compatibility
        but is not used; one verifier call is counted per attempt.
        """
        self._reseed()
        results = []
        total_compute = 0.0
        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0
            verifier_calls = 0
            for agent in agents:
                for _ in range(max_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    attempts += 1
                    total_compute += result["compute_cost"]
                    # Verifier: check public test pass
                    verifier_calls += 1
                    if result["passed"]:
                        # Only run hidden test if public passed
                        oracle_res = self.oracle.score(
                            mode="code",
                            action={},
                            context={"previous_passed": best_hidden},
                            result=result,
                            agent_id=agent.agent_id,
                        )
                        best_score = max(best_score, oracle_res.raw_score)
                        best_hidden = best_hidden or result["hidden_passed"]
                        break  # stop retrying this agent
            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
                "verifier_calls": verifier_calls,
            })
        return self._summarize(results, total_compute, "verifier_retries")

    def run_occ_allocation(
        self,
        agents: "List[SimulatedCodeAgent]",
        max_attempts: int = 5,
        credit_threshold: float = 2.0,
    ) -> Dict:
        """OCC: allocate attempts based on agent credits and learned success rate.

        Key differences from baseline:
        - Track per-agent success rate across problems
        - Prioritize high success-rate, low-cost agents
        - Early stop on hidden pass
        - Broker limits repeated attempts when marginal value is low
        - Stop after any agent succeeds (no redundant expensive attempts)

        A fresh ``CreditLedger`` and ``ResourceBroker`` are created per call.
        Every agent earns a 3.0 seed credit on every problem, and this
        method never spends credits.

        NOTE: ``credit_threshold`` is accepted for interface compatibility
        but is not used.
        """
        self._reseed()
        results = []
        total_compute = 0.0
        ledger = CreditLedger(decay_lambda=0.05)
        broker = ResourceBroker()
        # Track per-agent historical success rate (one bool per attempt).
        agent_success: Dict[str, List[bool]] = {a.agent_id: [] for a in agents}

        def agent_value(a):
            # Estimated value = success_rate / cost; agents with no history
            # yet get a prior success rate of 0.3.
            history = agent_success.get(a.agent_id, [])
            rate = sum(history) / len(history) if history else 0.3
            return rate / max(1.0, a.cost_per_attempt)

        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0
            # Seed each agent with a small initial credit to allow at least one trial attempt
            for agent in agents:
                ledger.earn(
                    agent_id=agent.agent_id,
                    task_id=problem["task_id"],
                    action_id="seed",
                    amount=3.0,
                    oracle_score=0.0,
                    compute_cost=0.0,
                    reason="initial_trial_credit",
                )
            # Re-rank for every problem because the histories keep growing.
            ranked_agents = sorted(agents, key=agent_value, reverse=True)
            # Try ranked agents, escalate if they fail
            for agent in ranked_agents:
                # Check broker permission
                balance = ledger.balance(agent.agent_id, "general", "global")
                dec = broker.request(
                    "model_call_small",
                    agent.agent_id,
                    balance,
                    task_state={"progress": best_score, "urgency": 0.5},
                )
                if dec.decision == Decision.DENY:
                    continue
                for attempt_idx in range(max_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    attempts += 1
                    total_compute += result["compute_cost"]
                    oracle_res = self.oracle.score(
                        mode="code",
                        action={"tokens_used": result["tokens_used"]},
                        context={"previous_passed": best_hidden},
                        result=result,
                        agent_id=agent.agent_id,
                    )
                    # Earn credits when the oracle's raw score is at least 0.5
                    if oracle_res.raw_score >= 0.5:
                        ledger.earn(
                            agent_id=agent.agent_id,
                            task_id=problem["task_id"],
                            action_id=f"attempt_{attempt_idx}",
                            amount=oracle_res.reward_value * 5.0,
                            oracle_score=oracle_res.raw_score,
                            compute_cost=result["compute_cost"],
                            reason=oracle_res.reason,
                        )
                    best_score = max(best_score, oracle_res.raw_score)
                    best_hidden = best_hidden or result["hidden_passed"]
                    agent_success[agent.agent_id].append(result["hidden_passed"])
                    # Stop if we got a good solution
                    if result["hidden_passed"]:
                        break
                    # OCC-specific: from the second attempt on this problem
                    # onwards, give up on this agent when its three most
                    # recent attempts (across problems) have a success rate
                    # below 0.15, and move on to the next agent.
                    history = agent_success[agent.agent_id]
                    if len(history) >= 3:
                        recent_rate = sum(history[-3:]) / 3.0
                        if recent_rate < 0.15 and attempt_idx >= 1:
                            break
                    # Check if broker allows another attempt
                    balance = ledger.balance(agent.agent_id, "general", "global")
                    dec = broker.request(
                        "model_call_small",
                        agent.agent_id,
                        balance,
                        task_state={"progress": best_score, "urgency": 0.5},
                    )
                    if dec.decision == Decision.DENY:
                        break
                # If we already solved, skip remaining agents (crucial compute saving)
                if best_hidden:
                    break
            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
            })
        return self._summarize(results, total_compute, "occ_allocation")

    def _summarize(self, results: List[Dict], total_compute: float, label: str) -> Dict:
        """Aggregate per-problem results into a summary dict.

        Args:
            results: One dict per problem with at least ``pass``,
                ``raw_score`` and ``attempts``.
            total_compute: Sum of ``compute_cost`` over every attempt.
            label: Strategy name stored under ``"label"``.

        Returns:
            Summary with counts, rates and the raw ``results`` list. The
            ``"pass@1"`` key is the fraction of problems whose ``pass`` flag
            is truthy, i.e. solved by any attempt the strategy made. All
            ratios and the mean score are 0.0 when ``results`` is empty.
        """
        n = len(results)
        passes = sum(1 for r in results if r["pass"])
        total_attempts = sum(r["attempts"] for r in results)
        # np.mean([]) is NaN (with a RuntimeWarning) and NaN is not valid
        # JSON, so an empty run reports 0.0 like the other ratios do.
        mean_score = float(np.mean([r["raw_score"] for r in results])) if n else 0.0
        return {
            "label": label,
            "n_problems": n,
            "pass@1": passes / n if n else 0.0,
            "mean_raw_score": mean_score,
            "total_attempts": total_attempts,
            "mean_attempts_per_problem": total_attempts / n if n else 0.0,
            "total_compute": float(total_compute),
            "compute_per_problem": float(total_compute / n) if n else 0.0,
            "results": results,
        }

    def run_all(
        self,
        agents: "Optional[List[SimulatedCodeAgent]]" = None,
    ) -> Dict[str, Dict]:
        """Run every strategy on the same agents and return summaries by label.

        Loads the dataset first if ``self.problems`` is empty. When
        ``agents`` is None, three default agents of differing quality and
        cost are created. The same agent objects are passed to all three
        strategies in turn.
        """
        if not self.problems:
            self.load_data()
        if agents is None:
            # Varied quality and cost to show compute allocation tradeoffs
            agents = [
                SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
                SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
                SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
            ]
        return {
            "baseline_fixed": self.run_baseline_fixed(agents, fixed_attempts=3),
            "verifier_retries": self.run_verifier_retries(agents, max_attempts=5),
            "occ_allocation": self.run_occ_allocation(agents, max_attempts=5),
        }
def main():
    """Run the benchmark, print a report, and save the results as JSON.

    Builds a ``CodeBenchmark`` over 50 problems with seed 42, runs every
    strategy, prints per-strategy metrics plus the compute saved relative
    to ``baseline_fixed``, and writes the full results to
    ``/app/occ/reports/benchmark_code_results.json``.
    """
    bench = CodeBenchmark(max_problems=50, seed=42)
    bench.load_data()
    results = bench.run_all()

    print("=" * 60)
    print("CODE COMPUTE ALLOCATION BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        print(f"\n{label}")
        print(f"  pass@1: {res['pass@1']:.3f}")
        print(f"  mean attempts/problem: {res['mean_attempts_per_problem']:.2f}")
        print(f"  total compute: {res['total_compute']:.0f}")
        print(f"  compute/problem: {res['compute_per_problem']:.0f}")

    # Compute savings at iso-accuracy
    baseline_pass = results["baseline_fixed"]["pass@1"]
    baseline_compute = results["baseline_fixed"]["total_compute"]
    for label in ["verifier_retries", "occ_allocation"]:
        r = results[label]
        if r["pass@1"] >= baseline_pass:
            if baseline_compute > 0:
                savings = 1.0 - (r["total_compute"] / baseline_compute)
                print(f"\n  {label}: {savings*100:.1f}% compute saved at >= baseline pass@1")
            else:
                # The baseline spent nothing (e.g. no problems were loaded),
                # so a relative saving cannot be computed.
                print(f"\n  {label}: baseline used no compute; savings undefined")
        else:
            print(f"\n  {label}: accuracy below baseline ({r['pass@1']:.3f} < {baseline_pass:.3f})")

    out_dir = Path("/app/occ/reports")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "benchmark_code_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot encode natively.
        json.dump(results, f, indent=2, default=str)
    # Report the path that was actually written.
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()