"""Run all 77 tasks with GPT-4o-mini and compute aggregate metrics."""
import sys
import json
import os
import re
import time
from dotenv import load_dotenv
load_dotenv()
sys.path.insert(0, ".")
sys.path.insert(0, "./server")
from openai import OpenAI
from server.hr_onboarding_environment import HROnboardingEnvironment
from models import HROnboardingAction
from server.tools import TOOL_DEFINITIONS
from server.rubrics import RubricEvaluator
# OpenAI client; picks up OPENAI_API_KEY from the environment (populated by
# the load_dotenv() call above).
client = OpenAI()

# Pretty-printed tool schema, embedded verbatim into the system prompt so the
# model sees exact tool and parameter names.
tool_desc = json.dumps(TOOL_DEFINITIONS, indent=2)

# System prompt as a single f-string template (literal JSON braces doubled);
# text is identical to the previous concatenated form.
system_prompt = f"""You are an HR automation agent for AcmeCorp. You help with employee onboarding and offboarding by calling the appropriate tools.

For each step, respond with ONLY a JSON tool call in this exact format:
{{"tool": "<tool_name>", "params": {{<parameters>}}}}

When you believe the task is complete, respond with:
{{"tool": "__done__", "params": {{}}}}

Important rules:
- Respond with ONLY the JSON object, no other text
- Use the exact tool names and parameter names from the tool definitions
- Think about what information you need and what tools to call in what order

Available tools:
{tool_desc}"""
# Per-task result dicts accumulated here; aggregated after the main loop.
results = []
evaluator = RubricEvaluator()
num_tasks = 77

print("=" * 70)
print("HR ONBOARDING ENVIRONMENT — FULL EVALUATION (77 tasks)")
# Fix: was an f-string with no placeholders (ruff F541) — plain literal.
print("Model: gpt-4o-mini")
print("=" * 70)
for task_idx in range(num_tasks):
    # Fresh environment per task. Repeated reset() advances through the task
    # list; with a fixed seed the cycling is presumably deterministic — the
    # (task_idx + 1)-th reset lands on task task_idx. TODO confirm against
    # HROnboardingEnvironment.reset().
    env = HROnboardingEnvironment(seed=42, max_steps=15)
    for _ in range(task_idx + 1):
        obs = env.reset()
    # NOTE(review): reaches into a private attribute; the evaluator needs the
    # task object, which the observation does not expose.
    task = env._current_task
    task_id = obs.task_id
    difficulty = obs.metadata.get("difficulty", "?")
    category = obs.metadata.get("category", "?")

    # Conversation transcript sent to the model; grows one assistant/user
    # pair per step.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": obs.instruction},
    ]
    steps_taken = 0   # successful env.step() calls
    error_count = 0   # malformed model replies

    for step in range(1, obs.max_steps + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0.1,
                max_tokens=512,
            )
            assistant_msg = response.choices[0].message.content.strip()
        except Exception as e:
            # Transient API failure: back off and retry with the same
            # transcript (this attempt still consumes one step of budget).
            print(f" API error on {task_id} step {step}: {e}")
            time.sleep(5)
            continue

        # Extract the first {...} span so prose around the JSON doesn't break
        # parsing; fall back to parsing the whole message.
        try:
            json_match = re.search(r'\{.*\}', assistant_msg, re.DOTALL)
            if json_match:
                tool_call = json.loads(json_match.group())
            else:
                tool_call = json.loads(assistant_msg)
        except json.JSONDecodeError:
            tool_call = None
        # Fix: valid-but-non-object JSON (e.g. a bare string or list) used to
        # crash on .get() below, aborting the whole run. Treat any non-dict
        # payload as a parse failure and ask the model to retry.
        if not isinstance(tool_call, dict):
            messages.append({"role": "assistant", "content": assistant_msg})
            messages.append({"role": "user", "content": 'Respond with valid JSON: {"tool": "<name>", "params": {<args>}}'})
            error_count += 1
            continue

        tool_name = tool_call.get("tool", "")
        params = tool_call.get("params", {})
        if tool_name == "__done__":
            # Model declares the task finished.
            break

        action = HROnboardingAction(tool_name=tool_name, arguments=params)
        obs = env.step(action)
        steps_taken += 1
        # Feed the tool result back and prompt for the next call.
        result_str = json.dumps(obs.tool_result, indent=2)
        messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": f"Tool result:\n{result_str}\n\nContinue with next tool call, or {{\"tool\": \"__done__\", \"params\": {{}}}} if done."})
        if obs.done:
            break

    # Score the recorded action log against the task's rubric.
    eval_result = evaluator.evaluate(task, env.world.action_log)
    result = {
        "task_id": task_id,
        "difficulty": difficulty,
        "category": category,
        "score": eval_result["score"],
        "passed": eval_result["passed"],
        "passed_count": eval_result["passed_count"],
        "total_criteria": eval_result["total_criteria"],
        "steps_taken": steps_taken,
        "parse_errors": error_count,
    }
    results.append(result)
    status = "PASS" if result["passed"] else "FAIL"
    # Consistency: use num_tasks instead of a second hard-coded 77.
    print(f" [{task_idx+1:2d}/{num_tasks}] {task_id:10s} [{difficulty:10s}] [{category:14s}] "
          f"Score: {result['score']:.0%} ({result['passed_count']}/{result['total_criteria']}) "
          f"Steps: {steps_taken:2d} {status}")
# --- Aggregate metrics ---
# Overall pass rate, mean score/steps, and per-criterion hit rate, followed
# by breakdowns per difficulty level and per task category.
separator = "=" * 70
print("\n" + separator)
print("AGGREGATE RESULTS")
print(separator)

total = len(results)
pass_count = len([entry for entry in results if entry["passed"]])
mean_score = sum(entry["score"] for entry in results) / total
mean_steps = sum(entry["steps_taken"] for entry in results) / total
total_criteria = sum(entry["total_criteria"] for entry in results)
total_passed_criteria = sum(entry["passed_count"] for entry in results)

print("\nOverall:")
print(f" Tasks: {total}")
print(f" Pass rate: {pass_count}/{total} ({pass_count/total:.1%})")
print(f" Mean score: {mean_score:.3f}")
print(f" Mean steps: {mean_steps:.1f}")
print(f" Criteria hit: {total_passed_criteria}/{total_criteria} ({total_passed_criteria/total_criteria:.1%})")

# Breakdown by difficulty (skips levels with no tasks).
print("\nBy Difficulty:")
for diff in ("simple", "medium", "complex", "edge_case"):
    bucket = [entry for entry in results if entry["difficulty"] == diff]
    if bucket:
        n = len(bucket)
        p = len([entry for entry in bucket if entry["passed"]])
        s = sum(entry["score"] for entry in bucket) / n
        st = sum(entry["steps_taken"] for entry in bucket) / n
        print(f" {diff:10s}: {p:2d}/{n:2d} pass ({p/n:.0%}) mean_score={s:.2f} mean_steps={st:.1f}")

# Breakdown by category (skips categories with no tasks).
print("\nBy Category:")
for cat in ("lookup", "onboarding", "offboarding", "cross_workflow"):
    bucket = [entry for entry in results if entry["category"] == cat]
    if bucket:
        n = len(bucket)
        p = len([entry for entry in bucket if entry["passed"]])
        s = sum(entry["score"] for entry in bucket) / n
        print(f" {cat:14s}: {p:2d}/{n:2d} pass ({p/n:.0%}) mean_score={s:.2f}")
# Persist the summary plus the full per-task result list as JSON.
os.makedirs("outputs", exist_ok=True)
summary = {
    "model": "gpt-4o-mini",
    "total_tasks": total,
    "pass_count": pass_count,
    "pass_rate": pass_count / total,
    "mean_score": mean_score,
    "mean_steps": mean_steps,
    "criteria_hit_rate": total_passed_criteria / total_criteria,
    "results": results,
}
with open("outputs/full_eval_results.json", "w") as f:
    json.dump(summary, f, indent=2)
print("\nDetailed results saved to outputs/full_eval_results.json")