# meta-hackathon / benchmark_agentic.py
# Gokultm's picture
# Agentics flow for different planners
# 6543833
"""Run local benchmarks for the explicit skill-routed agentic policies.
This script keeps the environment unchanged and benchmarks policy-side agentic
extensions on top of it. The main goal is to show that a small skill router can
compose reusable conversational skills while still solving the benchmark.
Example:
py -3 benchmark_agentic.py
"""
from __future__ import annotations
import argparse
import json
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean
from typing import Any, Dict, List
from src.agentic import SkillRoutedDeterministicPolicy
from src.baselines import StageAwareHeuristicBaseline
from src.env import ESCEnv
from src.models import Action
from src.tasks import TASKS
@dataclass
class AgenticEpisodeSummary:
    """Flat record describing one policy run on one task.

    Instances are produced by ``run_episode`` and later serialised with
    ``dataclasses.asdict`` for the JSON report, so every field is a plain
    built-in value or a container of built-in values.
    """

    # --- identity of the run -------------------------------------------
    policy: str
    task_id: str
    difficulty: str

    # --- outcome metrics -----------------------------------------------
    steps: int
    score: float
    success: bool
    completion: float
    avg_step_reward: float
    final_resolution: float
    had_safety_reference: bool

    # --- optional diagnostics ------------------------------------------
    # Each instance gets its own fresh container via ``default_factory``.
    skill_counts: Dict[str, int] = field(default_factory=dict)
    skill_trace: List[str] = field(default_factory=list)
    transcript_excerpt: List[str] = field(default_factory=list)
def _extract_skill_trace(policy: Any, max_entries: int = 6) -> tuple[Dict[str, int], List[str]]:
    """Collect skill usage counts and a short decision trace from *policy*.

    The helper is best-effort: a policy that does not expose both a
    ``decision_log`` and a ``memory`` attribute simply yields empty results,
    and attributes that are present but ``None`` are treated as empty.

    Args:
        policy: Any policy object. Only ``policy.decision_log`` (a sliceable
            sequence of mappings) and ``policy.memory.skill_counts`` are read.
        max_entries: Upper bound on how many leading decision-log entries are
            rendered into the trace. Defaults to 6; negative values are
            treated as 0.

    Returns:
        ``(counts, trace)`` where ``counts`` is a copy of the policy's skill
        counters and ``trace`` holds one formatted line per rendered entry.
    """
    if not hasattr(policy, "decision_log") or not hasattr(policy, "memory"):
        return {}, []

    # `or` fallbacks cover attributes that exist but are still None.
    counts = dict(getattr(policy.memory, "skill_counts", None) or {})
    decision_log = policy.decision_log or []

    trace: List[str] = []
    for entry in decision_log[: max(max_entries, 0)]:
        # Turn is shifted by one for display (presumably logged 0-based --
        # confirm against the policy); a missing or None turn counts as 0.
        turn = int(entry.get("turn") or 0) + 1
        trace.append(
            f"Turn {turn} [{entry.get('stage', '')}] -> {entry.get('skill', '')}: {entry.get('reason', '')}"
        )
    return counts, trace
def run_episode(env: ESCEnv, policy: Any, task_id: str) -> AgenticEpisodeSummary:
    """Play *policy* against *env* on a single task and summarise the run.

    The policy and the environment are both reset for ``task_id``; the policy
    is then asked for one message per step until the environment reports
    ``done``.

    Args:
        env: Environment exposing ``reset(task_id=...)`` and ``step(action)``.
        policy: Object exposing ``reset(task_id)``, ``act(observation)`` and
            a ``name`` attribute.
        task_id: Key into ``TASKS`` selecting the scenario to run.

    Returns:
        An ``AgenticEpisodeSummary`` built from the last step's ``info``
        mapping, the collected rewards and the first ten transcript lines.
    """
    task = TASKS[task_id]
    policy.reset(task_id)
    obs = env.reset(task_id=task_id).observation

    step_rewards: List[float] = []
    dialogue: List[str] = [f"Seeker: {obs.seeker_utterance}"]

    # At least one step is always taken; the loop ends once the
    # environment flags the episode as done.
    finished = False
    while not finished:
        message = policy.act(obs)
        dialogue.append(f"Agent: {message}")

        result = env.step(Action(message=message))
        step_rewards.append(float(result.reward))

        obs = result.observation
        dialogue.append(f"Seeker: {obs.seeker_utterance}")
        finished = result.done

    # Final metrics are read from the last step's info mapping; a missing
    # "final" entry falls back to zeros below.
    final = result.info.get("final", {})
    skill_counts, skill_trace = _extract_skill_trace(policy)

    average_reward = mean(step_rewards) if step_rewards else 0.0
    succeeded = bool(final.get("success", 0.0) >= 1.0)
    safety_flag = bool(result.info.get("had_safety_reference", False))

    return AgenticEpisodeSummary(
        policy=policy.name,
        task_id=task_id,
        difficulty=task.difficulty,
        steps=obs.turn,
        score=float(final.get("score", 0.0)),
        success=succeeded,
        completion=float(final.get("completion", 0.0)),
        avg_step_reward=average_reward,
        final_resolution=float(final.get("final_resolution", 0.0)),
        had_safety_reference=safety_flag,
        skill_counts=skill_counts,
        skill_trace=skill_trace,
        transcript_excerpt=dialogue[:10],
    )
def summarize_by_policy(episodes: List[AgenticEpisodeSummary]) -> List[Dict[str, Any]]:
    """Average the episode metrics per policy.

    Args:
        episodes: Episode summaries, possibly covering several policies.

    Returns:
        One dict per distinct policy name with the keys ``policy``,
        ``avg_score``, ``success_rate``, ``avg_steps``, ``avg_completion``
        and ``avg_resolution``, ordered by ``avg_score`` from highest to
        lowest. Policies with equal scores keep first-seen order.
    """
    # Bucket episodes under their policy name, preserving first-seen order.
    grouped: Dict[str, List[AgenticEpisodeSummary]] = {}
    for ep in episodes:
        if ep.policy not in grouped:
            grouped[ep.policy] = []
        grouped[ep.policy].append(ep)

    def _build_row(name: str, members: List[AgenticEpisodeSummary]) -> Dict[str, Any]:
        # Collapse one policy's episodes into a single row of averages.
        return {
            "policy": name,
            "avg_score": mean([m.score for m in members]),
            "success_rate": mean([1.0 if m.success else 0.0 for m in members]),
            "avg_steps": mean([m.steps for m in members]),
            "avg_completion": mean([m.completion for m in members]),
            "avg_resolution": mean([m.final_resolution for m in members]),
        }

    table = [_build_row(name, members) for name, members in grouped.items()]
    return sorted(table, key=lambda entry: entry["avg_score"], reverse=True)
def aggregate_skill_counts(episodes: List[AgenticEpisodeSummary], policy_name: str) -> Dict[str, int]:
    """Sum the per-episode skill counters of a single policy.

    Args:
        episodes: Episode summaries from any number of policies.
        policy_name: Only episodes whose ``policy`` equals this name count.

    Returns:
        Skill name -> total count, ordered by descending total and then by
        skill name. Empty when no episode matches.
    """
    combined: Dict[str, int] = {}
    selected = (ep for ep in episodes if ep.policy == policy_name)
    for ep in selected:
        for skill, uses in ep.skill_counts.items():
            combined[skill] = combined.get(skill, 0) + uses

    # Largest totals first; ties resolved alphabetically by skill name.
    ranked = sorted(combined.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ranked)
def render_markdown(episodes: List[AgenticEpisodeSummary], generated_at: str) -> str:
    """Render the benchmark episodes as a Markdown report.

    The report has these sections, in order: summary table per policy,
    takeaways, skill usage totals for the ``skill_routed_deterministic``
    policy, per-task results, skill trace excerpts (only for episodes that
    have a trace) and transcript excerpts.

    Args:
        episodes: Episode summaries to report on.
        generated_at: Timestamp text placed verbatim under the title.

    Returns:
        The Markdown document, stripped of surrounding whitespace and
        terminated by exactly one newline.
    """

    def _table_row(*cells: Any) -> str:
        # Format the cells as a single Markdown table row.
        return "| " + " | ".join(f"{cell}" for cell in cells) + " |"

    def _first_row_for(name: str, rows: List[Dict[str, Any]]) -> Any:
        # First summary row belonging to the named policy, or None.
        for candidate in rows:
            if candidate["policy"] == name:
                return candidate
        return None

    summary_rows = summarize_by_policy(episodes)
    skill_totals = aggregate_skill_counts(episodes, "skill_routed_deterministic")
    reference_row = _first_row_for("stage_aware_heuristic", summary_rows)
    agentic_row = _first_row_for("skill_routed_deterministic", summary_rows)

    # --- title and summary table ---------------------------------------
    doc: List[str] = [
        "# Agentic Benchmark Results",
        "",
        f"_Generated: {generated_at}_",
        "",
        "This report isolates the policy-side skills/agents story. The environment and rubric are unchanged.",
        "",
        "## Summary",
        "",
        "| Policy | Avg score | Success rate | Avg steps | Completion | Final resolution |",
        "| --- | ---: | ---: | ---: | ---: | ---: |",
    ]
    for row in summary_rows:
        doc.append(
            _table_row(
                row["policy"],
                f"{row['avg_score']:.3f}",
                f"{row['success_rate']:.2f}",
                f"{row['avg_steps']:.2f}",
                f"{row['avg_completion']:.2f}",
                f"{row['avg_resolution']:.3f}",
            )
        )

    # --- takeaways -----------------------------------------------------
    doc += ["", "## Takeaways", ""]
    # The score delta line needs both policies to be present in the summary.
    if reference_row is not None and agentic_row is not None:
        delta = agentic_row["avg_score"] - reference_row["avg_score"]
        doc.append(
            f"- The explicit skill-routed policy scored `{agentic_row['avg_score']:.3f}`, "
            f"for a delta of `{delta:+.3f}` versus the non-agentic staged heuristic."
        )
    doc += [
        "- The skill-routed policy keeps the benchmark deterministic while making the policy decomposition visible to judges.",
        "- Safety escalation remains a policy-side decision; the hard task still requires the environment-level safety-aware finish.",
    ]

    # --- skill usage totals --------------------------------------------
    doc += [
        "",
        "## Skill Usage Totals",
        "",
        "| Skill | Total turns |",
        "| --- | ---: |",
    ]
    doc.extend(_table_row(skill_name, count) for skill_name, count in skill_totals.items())

    # --- per-task results ----------------------------------------------
    doc += [
        "",
        "## Per-Task Results",
        "",
        "| Task | Difficulty | Policy | Score | Success | Completion | Steps | Safety ref |",
        "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |",
    ]
    ordered = sorted(episodes, key=lambda item: (item.task_id, item.policy))
    for episode in ordered:
        doc.append(
            _table_row(
                episode.task_id,
                episode.difficulty,
                episode.policy,
                f"{episode.score:.3f}",
                int(episode.success),
                f"{episode.completion:.1f}",
                episode.steps,
                int(episode.had_safety_reference),
            )
        )

    # --- skill trace excerpts (episodes without a trace are skipped) ---
    doc += ["", "## Skill Trace Excerpts", ""]
    for episode in episodes:
        if episode.skill_trace:
            doc += [
                f"### {episode.task_id} - {episode.policy}",
                "",
                (
                    f"- Score: `{episode.score:.3f}` | Success: `{episode.success}` | "
                    f"Completion: `{episode.completion:.1f}`"
                ),
            ]
            doc.extend(f"- {trace}" for trace in episode.skill_trace)
            doc.append("")

    # --- transcript excerpts -------------------------------------------
    doc += ["## Transcript Excerpts", ""]
    for episode in episodes:
        doc += [f"### {episode.task_id} - {episode.policy}", ""]
        doc.extend(f"- {line}" for line in episode.transcript_excerpt)
        doc.append("")

    return "\n".join(doc).strip() + "\n"
def main() -> None:
    """Run every policy on every task and write the Markdown and JSON reports.

    Command-line options:
        --output       Markdown report path (default
                       ``results/agentic_benchmarks.md``).
        --json-output  JSON report path (default
                       ``results/agentic_benchmarks.json``).

    Missing parent directories of both paths are created.
    """
    cli = argparse.ArgumentParser(description="Run the agentic local benchmark suite.")
    cli.add_argument("--output", default="results/agentic_benchmarks.md", help="Markdown output path.")
    cli.add_argument("--json-output", default="results/agentic_benchmarks.json", help="JSON output path.")
    options = cli.parse_args()

    # A single environment instance is shared by all policies and tasks.
    env = ESCEnv()
    policies = [StageAwareHeuristicBaseline(), SkillRoutedDeterministicPolicy()]
    episodes: List[AgenticEpisodeSummary] = [
        run_episode(env, policy, task_id) for policy in policies for task_id in TASKS
    ]

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ")
    markdown = render_markdown(episodes, generated_at=timestamp)

    md_path = Path(options.output)
    json_path = Path(options.json_output)
    for destination in (md_path, json_path):
        destination.parent.mkdir(parents=True, exist_ok=True)

    md_path.write_text(markdown, encoding="utf-8")
    serialised = json.dumps([asdict(ep) for ep in episodes], indent=2)
    json_path.write_text(serialised, encoding="utf-8")

    print(f"Wrote Markdown report to {md_path}")
    print(f"Wrote JSON report to {json_path}")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()