Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +88 -14
- pyproject.toml +15 -0
- reward_curve.png +3 -0
- server/app.py +1 -0
- server/static/index.html +69 -1
- server/tasks.py +310 -12
- test_all_tasks.py +186 -0
- train_hr_agent.ipynb +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
reward_curve.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -13,9 +13,22 @@ tags:
|
|
| 13 |
|
| 14 |
# HR Onboarding & Offboarding Environment
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
## Quick Start
|
| 21 |
|
|
@@ -226,8 +239,11 @@ rl_hack/
|
|
| 226 |
├── __init__.py # Module exports
|
| 227 |
├── client.py # HROnboardingEnv client
|
| 228 |
├── models.py # Action/Observation Pydantic models
|
| 229 |
-
├── test_with_llm.py # Test
|
|
|
|
|
|
|
| 230 |
├── .env # API keys (gitignored)
|
|
|
|
| 231 |
├── server/
|
| 232 |
├── __init__.py
|
| 233 |
├── app.py # FastAPI application
|
|
@@ -260,7 +276,7 @@ You can test the environment locally using GPT (or any OpenAI-compatible model)
|
|
| 260 |
|
| 261 |
2. Install dependencies:
|
| 262 |
```bash
|
| 263 |
-
pip install
|
| 264 |
```
|
| 265 |
|
| 266 |
### Run
|
|
@@ -269,12 +285,15 @@ You can test the environment locally using GPT (or any OpenAI-compatible model)
|
|
| 269 |
cd rl_hack
|
| 270 |
|
| 271 |
# Test on default task (simple lookup)
|
| 272 |
-
uv run python -m test_with_llm
|
| 273 |
|
| 274 |
# Test a specific task by index (0-76)
|
| 275 |
uv run python -m test_with_llm 14 # medium onboarding task
|
| 276 |
uv run python -m test_with_llm 24 # complex full onboarding
|
| 277 |
uv run python -m test_with_llm 55 # edge case (headcount limit)
|
|
|
|
|
|
|
|
|
|
| 278 |
```
|
| 279 |
|
| 280 |
The script will:
|
|
@@ -322,29 +341,84 @@ Passed: True
|
|
| 322 |
| 55-66 | Edge case | Various | Headcount limits, license caps, RBAC |
|
| 323 |
| 67-76 | Complex | Cross-workflow | Transfers, rehires, manager departures |
|
| 324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
## Building & Running
|
| 326 |
|
| 327 |
```bash
|
|
|
|
|
|
|
|
|
|
| 328 |
# Build Docker image
|
| 329 |
docker build -t hr-onboarding-env:latest -f server/Dockerfile .
|
| 330 |
|
| 331 |
-
# Run locally (as OpenEnv HTTP server)
|
| 332 |
-
uvicorn server.app:app --reload --host 0.0.0.0 --port 7860
|
| 333 |
-
|
| 334 |
# Deploy to HF Spaces
|
| 335 |
openenv push
|
| 336 |
```
|
| 337 |
|
| 338 |
-
## Training
|
|
|
|
|
|
|
| 339 |
|
| 340 |
-
|
| 341 |
|
| 342 |
-
- **Model**:
|
| 343 |
- **Algorithm**: GRPO (Group Relative Policy Optimization)
|
| 344 |
-
- **
|
| 345 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
-
|
|
|
|
|
|
|
| 348 |
|
| 349 |
## Live Demo
|
| 350 |
|
|
|
|
| 13 |
|
| 14 |
# HR Onboarding & Offboarding Environment
|
| 15 |
|
| 16 |
+
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ravi03071991/rl_hack/blob/master/train_hr_agent.ipynb)
|
| 17 |
|
| 18 |
+
An OpenEnv-compatible RL environment that simulates enterprise HR onboarding and offboarding workflows. The agent orchestrates across **6 enterprise apps** — Workday, ServiceNow, Okta, Email, Slack, and Calendar — using 25 tools to complete multi-step tasks in a realistic HR system (200+ employees, 8 departments, RBAC, approval chains).
|
| 19 |
+
|
| 20 |
+
Built for the [OpenEnv Hackathon SF](https://cerebralvalley.ai/e/openenv-hackathon-sf/details) — **Statement 3.1: Professional Tasks** (Scaler AI Labs partner theme: Multi-App RL Environment for Enterprise Workflows).
|
| 21 |
+
|
| 22 |
+
### Key Results
|
| 23 |
+
|
| 24 |
+
> **GRPO training on Llama 3.2-1B-Instruct improves mean task score by +67% (0.37 → 0.62).**
|
| 25 |
+
> Complex multi-step task scores **more than double** (0.26 → 0.68). Gains generalize to held-out test tasks.
|
| 26 |
+
|
| 27 |
+
| | Baseline | Trained | Improvement |
|
| 28 |
+
|---|---------|---------|-------------|
|
| 29 |
+
| Mean Score | 0.370 | 0.617 | **+67%** |
|
| 30 |
+
| Complex Tasks | 0.26 | 0.68 | **+162%** |
|
| 31 |
+
| Pass Rate | 15.4% | 19.2% | +3.8pp |
|
| 32 |
|
| 33 |
## Quick Start
|
| 34 |
|
|
|
|
| 239 |
├── __init__.py # Module exports
|
| 240 |
├── client.py # HROnboardingEnv client
|
| 241 |
├── models.py # Action/Observation Pydantic models
|
| 242 |
+
├── test_with_llm.py # Test single task with GPT agent
|
| 243 |
+
├── test_all_tasks.py # Evaluate all 77 tasks
|
| 244 |
+
├── train_hr_agent.ipynb # GRPO training notebook (Unsloth)
|
| 245 |
├── .env # API keys (gitignored)
|
| 246 |
+
├── outputs/ # Evaluation results
|
| 247 |
├── server/
|
| 248 |
├── __init__.py
|
| 249 |
├── app.py # FastAPI application
|
|
|
|
| 276 |
|
| 277 |
2. Install dependencies:
|
| 278 |
```bash
|
| 279 |
+
uv pip install -e ".[eval]"
|
| 280 |
```
|
| 281 |
|
| 282 |
### Run
|
|
|
|
| 285 |
cd rl_hack
|
| 286 |
|
| 287 |
# Test on default task (simple lookup)
|
| 288 |
+
uv run python -m test_with_llm
|
| 289 |
|
| 290 |
# Test a specific task by index (0-76)
|
| 291 |
uv run python -m test_with_llm 14 # medium onboarding task
|
| 292 |
uv run python -m test_with_llm 24 # complex full onboarding
|
| 293 |
uv run python -m test_with_llm 55 # edge case (headcount limit)
|
| 294 |
+
|
| 295 |
+
# Run full evaluation across all 77 tasks
|
| 296 |
+
uv run python test_all_tasks.py
|
| 297 |
```
|
| 298 |
|
| 299 |
The script will:
|
|
|
|
| 341 |
| 55-66 | Edge case | Various | Headcount limits, license caps, RBAC |
|
| 342 |
| 67-76 | Complex | Cross-workflow | Transfers, rehires, manager departures |
|
| 343 |
|
| 344 |
+
## Installation
|
| 345 |
+
|
| 346 |
+
```bash
|
| 347 |
+
# Clone the repo
|
| 348 |
+
git clone https://github.com/ravi03071991/rl_hack.git
|
| 349 |
+
cd rl_hack
|
| 350 |
+
|
| 351 |
+
# Install core dependencies
|
| 352 |
+
uv pip install -e .
|
| 353 |
+
|
| 354 |
+
# Install with evaluation support (adds openai)
|
| 355 |
+
uv pip install -e ".[eval]"
|
| 356 |
+
|
| 357 |
+
# Install with training support (adds unsloth, trl, torch, etc.)
|
| 358 |
+
uv pip install -e ".[train]"
|
| 359 |
+
|
| 360 |
+
# Install everything
|
| 361 |
+
uv pip install -e ".[eval,train,dev]"
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
## Building & Running
|
| 365 |
|
| 366 |
```bash
|
| 367 |
+
# Run locally (as OpenEnv HTTP server with playground UI)
|
| 368 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 7860
|
| 369 |
+
|
| 370 |
# Build Docker image
|
| 371 |
docker build -t hr-onboarding-env:latest -f server/Dockerfile .
|
| 372 |
|
|
|
|
|
|
|
|
|
|
| 373 |
# Deploy to HF Spaces
|
| 374 |
openenv push
|
| 375 |
```
|
| 376 |
|
| 377 |
+
## Training & Results
|
| 378 |
+
|
| 379 |
+
We use Unsloth + GRPO to train an LLM agent on this environment. See [`train_hr_agent.ipynb`](train_hr_agent.ipynb) for the full training notebook and [W&B run](https://wandb.ai/ravi03071991/hr-agent-training/runs/bgent3o3?nw=nwuserravi03071991) for live training metrics.
|
| 380 |
|
| 381 |
+
### Setup
|
| 382 |
|
| 383 |
+
- **Model**: Llama 3.2-1B-Instruct (4-bit quantized, LoRA rank 8)
|
| 384 |
- **Algorithm**: GRPO (Group Relative Policy Optimization)
|
| 385 |
+
- **Reward functions**: Valid JSON + rubric score + efficiency
|
| 386 |
+
- **Training**: 300 steps, 6 generations per prompt, lr=5e-5 with cosine schedule
|
| 387 |
+
- **Data split**: 70/30 stratified train/test (52 train, 25 test tasks)
|
| 388 |
+
|
| 389 |
+
### Results
|
| 390 |
+
|
| 391 |
+
GRPO training significantly improves the model's ability to complete HR workflows:
|
| 392 |
+
|
| 393 |
+
| Metric | Base Model | Trained | Change |
|
| 394 |
+
|--------|-----------|---------|--------|
|
| 395 |
+
| **Train pass rate** | 15.4% | 19.2% | +3.8pp |
|
| 396 |
+
| **Train mean score** | 0.370 | 0.617 | **+0.247 (+67%)** |
|
| 397 |
+
| **Test pass rate** | 12.0% | 16.0% | +4.0pp |
|
| 398 |
+
| **Test mean score** | 0.370 | 0.617 | **+0.247 (+67%)** |
|
| 399 |
+
|
| 400 |
+
#### Improvement by difficulty
|
| 401 |
+
|
| 402 |
+
| Difficulty | Baseline | Trained | Change |
|
| 403 |
+
|------------|----------|---------|--------|
|
| 404 |
+
| Simple | 0.23 | 0.50 | +0.27 |
|
| 405 |
+
| Medium | 0.72 | 0.86 | +0.14 |
|
| 406 |
+
| **Complex** | **0.26** | **0.68** | **+0.42** |
|
| 407 |
+
| Edge case | 0.22 | 0.25 | +0.03 |
|
| 408 |
+
|
| 409 |
+
The biggest gains are on **complex multi-step tasks** — scores more than doubled. The improvement **generalizes to held-out test tasks**, proving the model learned transferable HR workflow skills.
|
| 410 |
+
|
| 411 |
+
### Reward Curve
|
| 412 |
+
|
| 413 |
+

|
| 414 |
+
|
| 415 |
+
The moving average reward trends upward from ~2-3 early in training to ~4-5 by the end, showing consistent learning.
|
| 416 |
+
|
| 417 |
+
### Quick start (Colab)
|
| 418 |
|
| 419 |
+
1. Click the Colab badge at the top to open `train_hr_agent.ipynb` in Google Colab
|
| 420 |
+
2. Select a GPU runtime
|
| 421 |
+
3. Run all cells — installs dependencies, trains, and evaluates automatically
|
| 422 |
|
| 423 |
## Live Demo
|
| 424 |
|
pyproject.toml
CHANGED
|
@@ -9,9 +9,24 @@ description = "HR Onboarding/Offboarding environment for OpenEnv — simulates e
|
|
| 9 |
requires-python = ">=3.10"
|
| 10 |
dependencies = [
|
| 11 |
"openenv-core[core]>=0.2.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
]
|
| 13 |
|
| 14 |
[project.optional-dependencies]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
dev = [
|
| 16 |
"pytest>=8.0.0",
|
| 17 |
"pytest-cov>=4.0.0",
|
|
|
|
| 9 |
requires-python = ">=3.10"
|
| 10 |
dependencies = [
|
| 11 |
"openenv-core[core]>=0.2.0",
|
| 12 |
+
"fastapi>=0.100.0",
|
| 13 |
+
"uvicorn>=0.20.0",
|
| 14 |
+
"pydantic>=2.0.0",
|
| 15 |
+
"python-dotenv>=1.0.0",
|
| 16 |
]
|
| 17 |
|
| 18 |
[project.optional-dependencies]
|
| 19 |
+
eval = [
|
| 20 |
+
"openai>=1.0.0",
|
| 21 |
+
]
|
| 22 |
+
train = [
|
| 23 |
+
"unsloth",
|
| 24 |
+
"trl>=0.22.0",
|
| 25 |
+
"datasets>=2.0.0",
|
| 26 |
+
"torch>=2.0.0",
|
| 27 |
+
"transformers>=4.40.0",
|
| 28 |
+
"bitsandbytes>=0.43.0",
|
| 29 |
+
]
|
| 30 |
dev = [
|
| 31 |
"pytest>=8.0.0",
|
| 32 |
"pytest-cov>=4.0.0",
|
reward_curve.png
ADDED
|
Git LFS Details
|
server/app.py
CHANGED
|
@@ -77,6 +77,7 @@ def get_tasks():
|
|
| 77 |
"difficulty": task.difficulty,
|
| 78 |
"category": task.category,
|
| 79 |
"expected_tools": task.expected_tools,
|
|
|
|
| 80 |
"num_criteria": len(task.rubric_criteria),
|
| 81 |
})
|
| 82 |
|
|
|
|
| 77 |
"difficulty": task.difficulty,
|
| 78 |
"category": task.category,
|
| 79 |
"expected_tools": task.expected_tools,
|
| 80 |
+
"rubric_criteria": task.rubric_criteria,
|
| 81 |
"num_criteria": len(task.rubric_criteria),
|
| 82 |
})
|
| 83 |
|
server/static/index.html
CHANGED
|
@@ -189,6 +189,41 @@
|
|
| 189 |
color: #d0d0d0;
|
| 190 |
}
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
.step-indicator {
|
| 193 |
display: flex;
|
| 194 |
align-items: center;
|
|
@@ -640,6 +675,7 @@
|
|
| 640 |
body: JSON.stringify({ task_idx: idx }),
|
| 641 |
});
|
| 642 |
const data = await res.json();
|
|
|
|
| 643 |
|
| 644 |
// Update instruction
|
| 645 |
const instrEl = document.getElementById('taskInstruction');
|
|
@@ -650,7 +686,8 @@
|
|
| 650 |
<span class="task-tag tag-${task.category}">${task.category}</span>
|
| 651 |
${data.task_id}
|
| 652 |
</h3>
|
| 653 |
-
<p>${data.instruction}</p>
|
|
|
|
| 654 |
<div class="step-indicator">
|
| 655 |
<span>Step ${currentStep}/${maxSteps}</span>
|
| 656 |
<div class="step-bar"><div class="step-bar-fill" style="width: 0%"></div></div>
|
|
@@ -760,6 +797,28 @@
|
|
| 760 |
log.scrollTop = log.scrollHeight;
|
| 761 |
}
|
| 762 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 763 |
// --- Evaluation ---
|
| 764 |
function showEvaluation(evalData) {
|
| 765 |
const section = document.getElementById('evalSection');
|
|
@@ -810,6 +869,15 @@
|
|
| 810 |
`).join('');
|
| 811 |
}
|
| 812 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 813 |
function selectTool(name) {
|
| 814 |
document.getElementById('toolSelect').value = name;
|
| 815 |
// Show parameter hints
|
|
|
|
| 189 |
color: #d0d0d0;
|
| 190 |
}
|
| 191 |
|
| 192 |
+
.ideal-result {
|
| 193 |
+
margin-top: 14px;
|
| 194 |
+
background: #141a26;
|
| 195 |
+
border: 1px solid #243049;
|
| 196 |
+
border-left: 3px solid #58a6ff;
|
| 197 |
+
border-radius: 8px;
|
| 198 |
+
padding: 12px;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.ideal-result h4 {
|
| 202 |
+
font-size: 12px;
|
| 203 |
+
color: #9cc7ff;
|
| 204 |
+
margin-bottom: 8px;
|
| 205 |
+
text-transform: uppercase;
|
| 206 |
+
letter-spacing: 0.4px;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.ideal-result .ideal-label {
|
| 210 |
+
font-size: 11px;
|
| 211 |
+
color: #7f8da3;
|
| 212 |
+
margin: 6px 0 4px;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.ideal-result ul {
|
| 216 |
+
margin: 0;
|
| 217 |
+
padding-left: 18px;
|
| 218 |
+
color: #c7d2e1;
|
| 219 |
+
font-size: 12px;
|
| 220 |
+
line-height: 1.45;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.ideal-result li {
|
| 224 |
+
margin-bottom: 3px;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
.step-indicator {
|
| 228 |
display: flex;
|
| 229 |
align-items: center;
|
|
|
|
| 675 |
body: JSON.stringify({ task_idx: idx }),
|
| 676 |
});
|
| 677 |
const data = await res.json();
|
| 678 |
+
maxSteps = data.max_steps || maxSteps;
|
| 679 |
|
| 680 |
// Update instruction
|
| 681 |
const instrEl = document.getElementById('taskInstruction');
|
|
|
|
| 686 |
<span class="task-tag tag-${task.category}">${task.category}</span>
|
| 687 |
${data.task_id}
|
| 688 |
</h3>
|
| 689 |
+
<p>${escapeHtml(data.instruction)}</p>
|
| 690 |
+
${renderIdealResult(task)}
|
| 691 |
<div class="step-indicator">
|
| 692 |
<span>Step ${currentStep}/${maxSteps}</span>
|
| 693 |
<div class="step-bar"><div class="step-bar-fill" style="width: 0%"></div></div>
|
|
|
|
| 797 |
log.scrollTop = log.scrollHeight;
|
| 798 |
}
|
| 799 |
|
| 800 |
+
function renderIdealResult(task) {
|
| 801 |
+
if (!task) return '';
|
| 802 |
+
|
| 803 |
+
const expectedTools = (task.expected_tools || [])
|
| 804 |
+
.map(t => `<li><code>${escapeHtml(String(t))}</code></li>`)
|
| 805 |
+
.join('');
|
| 806 |
+
|
| 807 |
+
const criteria = (task.rubric_criteria || [])
|
| 808 |
+
.map(c => `<li>${escapeHtml(c.description || c.name || 'Criterion')}</li>`)
|
| 809 |
+
.join('');
|
| 810 |
+
|
| 811 |
+
return `
|
| 812 |
+
<div class="ideal-result">
|
| 813 |
+
<h4>Ideal Result</h4>
|
| 814 |
+
<div class="ideal-label">Expected tools:</div>
|
| 815 |
+
<ul>${expectedTools || '<li>Not specified</li>'}</ul>
|
| 816 |
+
<div class="ideal-label">Success criteria:</div>
|
| 817 |
+
<ul>${criteria || '<li>Not specified</li>'}</ul>
|
| 818 |
+
</div>
|
| 819 |
+
`;
|
| 820 |
+
}
|
| 821 |
+
|
| 822 |
// --- Evaluation ---
|
| 823 |
function showEvaluation(evalData) {
|
| 824 |
const section = document.getElementById('evalSection');
|
|
|
|
| 869 |
`).join('');
|
| 870 |
}
|
| 871 |
|
| 872 |
+
function escapeHtml(value) {
|
| 873 |
+
return String(value)
|
| 874 |
+
.replace(/&/g, '&amp;')
|
| 875 |
+
.replace(/</g, '&lt;')
|
| 876 |
+
.replace(/>/g, '&gt;')
|
| 877 |
+
.replace(/"/g, '&quot;')
|
| 878 |
+
.replace(/'/g, '&#39;');
|
| 879 |
+
}
|
| 880 |
+
|
| 881 |
function selectTool(name) {
|
| 882 |
document.getElementById('toolSelect').value = name;
|
| 883 |
// Show parameter hints
|
server/tasks.py
CHANGED
|
@@ -85,9 +85,10 @@ class TaskGenerator:
|
|
| 85 |
return f"task_{self._task_counter:04d}"
|
| 86 |
|
| 87 |
def generate_all_tasks(self) -> list[Task]:
|
| 88 |
-
"""Generate the full task set (~
|
| 89 |
tasks = []
|
| 90 |
tasks.extend(self._simple_lookup_tasks())
|
|
|
|
| 91 |
tasks.extend(self._simple_onboarding_tasks())
|
| 92 |
tasks.extend(self._medium_onboarding_tasks())
|
| 93 |
tasks.extend(self._complex_onboarding_tasks())
|
|
@@ -211,6 +212,126 @@ class TaskGenerator:
|
|
| 211 |
|
| 212 |
return tasks
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
# ---- Simple Onboarding Tasks (5) ----
|
| 215 |
def _simple_onboarding_tasks(self) -> list[Task]:
|
| 216 |
tasks = []
|
|
@@ -264,6 +385,22 @@ class TaskGenerator:
|
|
| 264 |
("David Brown", "Security", "L2", "Security Analyst"),
|
| 265 |
("Li Wei", "Engineering", "L3", "Senior Engineer"),
|
| 266 |
("Emma Davis", "Product", "L3", "Senior PM"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
]
|
| 268 |
|
| 269 |
for name, dept, level, role in names:
|
|
@@ -303,6 +440,17 @@ class TaskGenerator:
|
|
| 303 |
("Carlos Mendez", "Security", "L3", "Senior Security Engineer"),
|
| 304 |
("Rachel Green", "Product", "L2", "Product Designer"),
|
| 305 |
("Raj Kapoor", "Engineering", "L2", "Backend Developer"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
]
|
| 307 |
|
| 308 |
for name, dept, level, role in complex_hires:
|
|
@@ -347,6 +495,12 @@ class TaskGenerator:
|
|
| 347 |
("Hassan Ahmed", "Data Science", "L3", "Lead Data Scientist"),
|
| 348 |
("Laura Martinez", "Finance", "L3", "Senior Financial Analyst"),
|
| 349 |
("Kevin O'Brien", "Product", "L4", "VP of Product"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
]:
|
| 351 |
manager = _pick_manager_in_dept(self.world, dept, min_level="L4")
|
| 352 |
needs_security = dept == "Security" or int(level[1]) >= 4
|
|
@@ -429,6 +583,17 @@ class TaskGenerator:
|
|
| 429 |
("resignation", "Daniel Park is retiring"),
|
| 430 |
("resignation", "Christina Muller is taking a career break"),
|
| 431 |
("resignation", "Yuki Tanaka is going back to school"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
]
|
| 433 |
|
| 434 |
for reason, scenario in offboarding_scenarios:
|
|
@@ -439,12 +604,14 @@ class TaskGenerator:
|
|
| 439 |
name = emp["name"]
|
| 440 |
instruction = (
|
| 441 |
f"Initiate offboarding for {name} ({emp['emp_id']}) who {scenario.split(' is ')[1] if ' is ' in scenario else 'is leaving'}. "
|
|
|
|
| 442 |
f"Revoke their system access and notify IT."
|
| 443 |
)
|
| 444 |
|
| 445 |
criteria = [
|
| 446 |
{"name": "created_request", "description": "Created offboarding request", "check": "tool_used:offboarding_create_request"},
|
| 447 |
-
{"name": "
|
|
|
|
| 448 |
{"name": "revoked_access", "description": "Revoked IT access", "check": "tool_used:it_revoke_access"},
|
| 449 |
{"name": "notified", "description": "Sent notification", "check": "tool_used_any:email_send,slack_send_message"},
|
| 450 |
]
|
|
@@ -466,7 +633,7 @@ class TaskGenerator:
|
|
| 466 |
tasks = []
|
| 467 |
|
| 468 |
# Full offboarding for managers/directors with reports
|
| 469 |
-
for _ in range(
|
| 470 |
# Find an employee who has direct reports
|
| 471 |
candidates = [e for e in self.world.state["employees"]
|
| 472 |
if e["status"] == "active" and int(e["level"][1]) >= 3]
|
|
@@ -506,7 +673,7 @@ class TaskGenerator:
|
|
| 506 |
))
|
| 507 |
|
| 508 |
# Offboarding with asset reclamation
|
| 509 |
-
for _ in range(
|
| 510 |
emp = _pick_employee(self.world, status="active")
|
| 511 |
if not emp:
|
| 512 |
continue
|
|
@@ -542,7 +709,7 @@ class TaskGenerator:
|
|
| 542 |
tasks = []
|
| 543 |
|
| 544 |
# 1. Headcount limit exceeded
|
| 545 |
-
for dept in ["Marketing", "Finance"]:
|
| 546 |
tasks.append(Task(
|
| 547 |
task_id=self._next_id(),
|
| 548 |
instruction=f"Onboard a new L1 Associate to the {dept} department. Create their employee record and start onboarding.",
|
|
@@ -684,7 +851,7 @@ class TaskGenerator:
|
|
| 684 |
category="offboarding",
|
| 685 |
expected_tools=["offboarding_create_request", "it_revoke_access", "offboarding_complete_step"],
|
| 686 |
rubric_criteria=[
|
| 687 |
-
{"name": "created_request", "description": "Created offboarding with termination reason", "check": "
|
| 688 |
{"name": "revoked_access", "description": "Revoked all access", "check": "tool_used:it_revoke_access"},
|
| 689 |
{"name": "no_farewell", "description": "Did NOT send farewell communications", "check": "tool_not_used:slack_send_message"},
|
| 690 |
{"name": "completed_steps", "description": "Completed termination steps", "check": "tool_used:offboarding_complete_step"},
|
|
@@ -744,6 +911,134 @@ class TaskGenerator:
|
|
| 744 |
context={"edge_case": "policy_check"},
|
| 745 |
))
|
| 746 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
return tasks
|
| 748 |
|
| 749 |
# ---- Cross-Workflow Tasks (10) ----
|
|
@@ -755,6 +1050,9 @@ class TaskGenerator:
|
|
| 755 |
("Engineering", "Product"),
|
| 756 |
("Sales", "Marketing"),
|
| 757 |
("Data Science", "Engineering"),
|
|
|
|
|
|
|
|
|
|
| 758 |
]
|
| 759 |
for from_dept, to_dept in transfers:
|
| 760 |
emp = _pick_employee(self.world, status="active", department=from_dept)
|
|
@@ -784,8 +1082,8 @@ class TaskGenerator:
|
|
| 784 |
context={"target_emp_id": emp["emp_id"], "from_dept": from_dept, "to_dept": to_dept},
|
| 785 |
))
|
| 786 |
|
| 787 |
-
# 4-
|
| 788 |
-
for _ in range(
|
| 789 |
emp = _pick_employee(self.world, status="offboarded")
|
| 790 |
if not emp:
|
| 791 |
continue
|
|
@@ -812,8 +1110,8 @@ class TaskGenerator:
|
|
| 812 |
context={"target_emp_id": emp["emp_id"], "rehire": True},
|
| 813 |
))
|
| 814 |
|
| 815 |
-
#
|
| 816 |
-
for dept in self.rng.sample(["Engineering", "Product", "Data Science"],
|
| 817 |
tasks.append(Task(
|
| 818 |
task_id=self._next_id(),
|
| 819 |
instruction=(
|
|
@@ -831,8 +1129,8 @@ class TaskGenerator:
|
|
| 831 |
context={"department": dept},
|
| 832 |
))
|
| 833 |
|
| 834 |
-
#
|
| 835 |
-
for _ in range(
|
| 836 |
candidates = [e for e in self.world.state["employees"]
|
| 837 |
if e["status"] == "active" and int(e["level"][1]) >= 3
|
| 838 |
and e.get("manager_id")]
|
|
|
|
| 85 |
return f"task_{self._task_counter:04d}"
|
| 86 |
|
| 87 |
def generate_all_tasks(self) -> list[Task]:
|
| 88 |
+
"""Generate the full task set (~200 tasks)."""
|
| 89 |
tasks = []
|
| 90 |
tasks.extend(self._simple_lookup_tasks())
|
| 91 |
+
tasks.extend(self._additional_lookup_tasks())
|
| 92 |
tasks.extend(self._simple_onboarding_tasks())
|
| 93 |
tasks.extend(self._medium_onboarding_tasks())
|
| 94 |
tasks.extend(self._complex_onboarding_tasks())
|
|
|
|
| 212 |
|
| 213 |
return tasks
|
| 214 |
|
| 215 |
+
# ---- Additional Lookup Tasks ----
|
| 216 |
+
def _additional_lookup_tasks(self) -> list[Task]:
|
| 217 |
+
tasks = []
|
| 218 |
+
depts = ["Engineering", "Product", "Marketing", "Sales", "Finance", "HR", "Data Science", "Security"]
|
| 219 |
+
|
| 220 |
+
# More employee lookups by ID
|
| 221 |
+
for _ in range(5):
|
| 222 |
+
emp = _pick_employee(self.world, status="active")
|
| 223 |
+
if not emp:
|
| 224 |
+
continue
|
| 225 |
+
tasks.append(Task(
|
| 226 |
+
task_id=self._next_id(),
|
| 227 |
+
instruction=f"Find the employee record for {emp['name']} (employee ID: {emp['emp_id']}).",
|
| 228 |
+
difficulty="simple",
|
| 229 |
+
category="lookup",
|
| 230 |
+
expected_tools=["hr_read_employee"],
|
| 231 |
+
rubric_criteria=[
|
| 232 |
+
{"name": "correct_tool", "description": "Used hr_read_employee", "check": "tool_used:hr_read_employee"},
|
| 233 |
+
{"name": "correct_id", "description": "Passed correct emp_id", "check": f"param_value:hr_read_employee.emp_id={emp['emp_id']}"},
|
| 234 |
+
],
|
| 235 |
+
context={"target_emp_id": emp["emp_id"], "target_name": emp["name"]},
|
| 236 |
+
))
|
| 237 |
+
|
| 238 |
+
# More department searches
|
| 239 |
+
for dept in self.rng.sample(depts, 3):
|
| 240 |
+
tasks.append(Task(
|
| 241 |
+
task_id=self._next_id(),
|
| 242 |
+
instruction=f"Show me all team members in the {dept} department.",
|
| 243 |
+
difficulty="simple",
|
| 244 |
+
category="lookup",
|
| 245 |
+
expected_tools=["hr_search_employees"],
|
| 246 |
+
rubric_criteria=[
|
| 247 |
+
{"name": "correct_tool", "description": "Used hr_search_employees", "check": "tool_used:hr_search_employees"},
|
| 248 |
+
{"name": "correct_dept", "description": "Filtered by correct department", "check": f"param_value:hr_search_employees.department={dept}"},
|
| 249 |
+
],
|
| 250 |
+
context={"department": dept},
|
| 251 |
+
))
|
| 252 |
+
|
| 253 |
+
# More org chart lookups
|
| 254 |
+
for dept in self.rng.sample(depts, 2):
|
| 255 |
+
tasks.append(Task(
|
| 256 |
+
task_id=self._next_id(),
|
| 257 |
+
instruction=f"Pull up the org chart for the {dept} team.",
|
| 258 |
+
difficulty="simple",
|
| 259 |
+
category="lookup",
|
| 260 |
+
expected_tools=["hr_get_org_chart"],
|
| 261 |
+
rubric_criteria=[
|
| 262 |
+
{"name": "correct_tool", "description": "Used hr_get_org_chart", "check": "tool_used:hr_get_org_chart"},
|
| 263 |
+
{"name": "correct_dept", "description": "Passed correct department", "check": f"param_value:hr_get_org_chart.department={dept}"},
|
| 264 |
+
],
|
| 265 |
+
context={"department": dept},
|
| 266 |
+
))
|
| 267 |
+
|
| 268 |
+
# Search by level
|
| 269 |
+
for level in ["L3", "L4", "L5"]:
|
| 270 |
+
tasks.append(Task(
|
| 271 |
+
task_id=self._next_id(),
|
| 272 |
+
instruction=f"Find all employees at level {level} across the company.",
|
| 273 |
+
difficulty="simple",
|
| 274 |
+
category="lookup",
|
| 275 |
+
expected_tools=["hr_search_employees"],
|
| 276 |
+
rubric_criteria=[
|
| 277 |
+
{"name": "correct_tool", "description": "Used hr_search_employees", "check": "tool_used:hr_search_employees"},
|
| 278 |
+
{"name": "correct_level", "description": "Filtered by correct level", "check": f"param_value:hr_search_employees.level={level}"},
|
| 279 |
+
],
|
| 280 |
+
context={"level": level},
|
| 281 |
+
))
|
| 282 |
+
|
| 283 |
+
# Policy lookups
|
| 284 |
+
tasks.append(Task(
|
| 285 |
+
task_id=self._next_id(),
|
| 286 |
+
instruction="What is the company's termination policy? Look up the relevant HR policy.",
|
| 287 |
+
difficulty="simple",
|
| 288 |
+
category="lookup",
|
| 289 |
+
expected_tools=["policy_lookup"],
|
| 290 |
+
rubric_criteria=[
|
| 291 |
+
{"name": "correct_tool", "description": "Used policy_lookup", "check": "tool_used:policy_lookup"},
|
| 292 |
+
{"name": "relevant_topic", "description": "Searched for termination topic", "check": "param_contains:policy_lookup.topic=terminat"},
|
| 293 |
+
],
|
| 294 |
+
))
|
| 295 |
+
|
| 296 |
+
tasks.append(Task(
|
| 297 |
+
task_id=self._next_id(),
|
| 298 |
+
instruction="Look up the contractor onboarding policy.",
|
| 299 |
+
difficulty="simple",
|
| 300 |
+
category="lookup",
|
| 301 |
+
expected_tools=["policy_lookup"],
|
| 302 |
+
rubric_criteria=[
|
| 303 |
+
{"name": "correct_tool", "description": "Used policy_lookup", "check": "tool_used:policy_lookup"},
|
| 304 |
+
{"name": "relevant_topic", "description": "Searched for contractor topic", "check": "param_contains:policy_lookup.topic=contractor"},
|
| 305 |
+
],
|
| 306 |
+
))
|
| 307 |
+
|
| 308 |
+
# Asset checks
|
| 309 |
+
tasks.append(Task(
|
| 310 |
+
task_id=self._next_id(),
|
| 311 |
+
instruction="What monitors are currently available for assignment?",
|
| 312 |
+
difficulty="simple",
|
| 313 |
+
category="lookup",
|
| 314 |
+
expected_tools=["it_get_available_assets"],
|
| 315 |
+
rubric_criteria=[
|
| 316 |
+
{"name": "correct_tool", "description": "Used it_get_available_assets", "check": "tool_used:it_get_available_assets"},
|
| 317 |
+
{"name": "correct_type", "description": "Filtered by monitor type", "check": "param_value:it_get_available_assets.asset_type=monitor"},
|
| 318 |
+
],
|
| 319 |
+
))
|
| 320 |
+
|
| 321 |
+
tasks.append(Task(
|
| 322 |
+
task_id=self._next_id(),
|
| 323 |
+
instruction="Check how many phones are available for new hires.",
|
| 324 |
+
difficulty="simple",
|
| 325 |
+
category="lookup",
|
| 326 |
+
expected_tools=["it_get_available_assets"],
|
| 327 |
+
rubric_criteria=[
|
| 328 |
+
{"name": "correct_tool", "description": "Used it_get_available_assets", "check": "tool_used:it_get_available_assets"},
|
| 329 |
+
{"name": "correct_type", "description": "Filtered by phone type", "check": "param_value:it_get_available_assets.asset_type=phone"},
|
| 330 |
+
],
|
| 331 |
+
))
|
| 332 |
+
|
| 333 |
+
return tasks
|
| 334 |
+
|
| 335 |
# ---- Simple Onboarding Tasks (5) ----
|
| 336 |
def _simple_onboarding_tasks(self) -> list[Task]:
|
| 337 |
tasks = []
|
|
|
|
| 385 |
("David Brown", "Security", "L2", "Security Analyst"),
|
| 386 |
("Li Wei", "Engineering", "L3", "Senior Engineer"),
|
| 387 |
("Emma Davis", "Product", "L3", "Senior PM"),
|
| 388 |
+
# --- Additional medium onboarding hires ---
|
| 389 |
+
("Olivia Thompson", "Marketing", "L2", "Content Strategist"),
|
| 390 |
+
("Wei Zhang", "Engineering", "L3", "Staff Engineer"),
|
| 391 |
+
("Rosa Martinez", "Sales", "L2", "Account Executive"),
|
| 392 |
+
("Kofi Asante", "Data Science", "L1", "Junior Data Analyst"),
|
| 393 |
+
("Yuki Sato", "Product", "L1", "Associate PM"),
|
| 394 |
+
("Dmitri Volkov", "Security", "L3", "Senior Security Engineer"),
|
| 395 |
+
("Amara Okafor", "HR", "L2", "HR Business Partner"),
|
| 396 |
+
("Liam O'Connor", "Finance", "L3", "Senior Accountant"),
|
| 397 |
+
("Fatou Diallo", "Engineering", "L1", "Junior Developer"),
|
| 398 |
+
("Ines Moreau", "Marketing", "L3", "Marketing Manager"),
|
| 399 |
+
("Tariq Hassan", "Sales", "L3", "Sales Manager"),
|
| 400 |
+
("Mei-Ling Wu", "Data Science", "L2", "ML Engineer"),
|
| 401 |
+
("Jakob Andersen", "Product", "L2", "UX Researcher"),
|
| 402 |
+
("Chloe Dubois", "HR", "L3", "Senior HR Specialist"),
|
| 403 |
+
("Ravi Krishnan", "Finance", "L1", "Junior Analyst"),
|
| 404 |
]
|
| 405 |
|
| 406 |
for name, dept, level, role in names:
|
|
|
|
| 440 |
("Carlos Mendez", "Security", "L3", "Senior Security Engineer"),
|
| 441 |
("Rachel Green", "Product", "L2", "Product Designer"),
|
| 442 |
("Raj Kapoor", "Engineering", "L2", "Backend Developer"),
|
| 443 |
+
# --- Additional complex hires ---
|
| 444 |
+
("Sofia Andersson", "Marketing", "L3", "Brand Director"),
|
| 445 |
+
("Kwame Mensah", "Sales", "L2", "Enterprise Sales Rep"),
|
| 446 |
+
("Elena Popov", "Finance", "L3", "Senior Controller"),
|
| 447 |
+
("Marcus Washington", "HR", "L2", "Talent Acquisition Lead"),
|
| 448 |
+
("Yuna Park", "Data Science", "L2", "Data Engineer"),
|
| 449 |
+
("Omar Khalil", "Engineering", "L3", "DevOps Lead"),
|
| 450 |
+
("Isabella Romano", "Product", "L3", "Senior Product Manager"),
|
| 451 |
+
("Thabo Ndlovu", "Security", "L2", "Security Operations Analyst"),
|
| 452 |
+
("Annika Johansson", "Marketing", "L2", "Growth Marketing Manager"),
|
| 453 |
+
("Chen Wei", "Finance", "L2", "Financial Systems Analyst"),
|
| 454 |
]
|
| 455 |
|
| 456 |
for name, dept, level, role in complex_hires:
|
|
|
|
| 495 |
("Hassan Ahmed", "Data Science", "L3", "Lead Data Scientist"),
|
| 496 |
("Laura Martinez", "Finance", "L3", "Senior Financial Analyst"),
|
| 497 |
("Kevin O'Brien", "Product", "L4", "VP of Product"),
|
| 498 |
+
# --- Additional approval-chain hires ---
|
| 499 |
+
("Priscilla Nakamura", "Security", "L4", "Head of Security Operations"),
|
| 500 |
+
("Ahmed El-Sayed", "Engineering", "L3", "Principal Architect"),
|
| 501 |
+
("Gabriela Fernandez", "Data Science", "L4", "Director of Analytics"),
|
| 502 |
+
("Vikram Reddy", "Finance", "L4", "VP of Finance"),
|
| 503 |
+
("Nadia Kuznetsova", "HR", "L4", "VP of People"),
|
| 504 |
]:
|
| 505 |
manager = _pick_manager_in_dept(self.world, dept, min_level="L4")
|
| 506 |
needs_security = dept == "Security" or int(level[1]) >= 4
|
|
|
|
| 583 |
("resignation", "Daniel Park is retiring"),
|
| 584 |
("resignation", "Christina Muller is taking a career break"),
|
| 585 |
("resignation", "Yuki Tanaka is going back to school"),
|
| 586 |
+
# --- Additional offboarding scenarios ---
|
| 587 |
+
("resignation", "Ming Chen is pursuing a startup"),
|
| 588 |
+
("resignation", "Rosa Martinez is relocating internationally"),
|
| 589 |
+
("termination", "Brian Foster is being terminated for misconduct"),
|
| 590 |
+
("resignation", "Anika Gupta is joining a competitor"),
|
| 591 |
+
("resignation", "Jean-Pierre Leclerc is taking a sabbatical"),
|
| 592 |
+
("resignation", "Naomi Osei is transitioning to freelance work"),
|
| 593 |
+
("resignation", "Derek Olson is moving into academia"),
|
| 594 |
+
("termination", "Suki Yamamoto is being terminated for underperformance"),
|
| 595 |
+
("resignation", "Alejandro Ruiz is emigrating abroad"),
|
| 596 |
+
("resignation", "Priya Venkatesh is leaving for personal reasons"),
|
| 597 |
]
|
| 598 |
|
| 599 |
for reason, scenario in offboarding_scenarios:
|
|
|
|
| 604 |
name = emp["name"]
|
| 605 |
instruction = (
|
| 606 |
f"Initiate offboarding for {name} ({emp['emp_id']}) who {scenario.split(' is ')[1] if ' is ' in scenario else 'is leaving'}. "
|
| 607 |
+
f"Set the reason to '{reason}'. "
|
| 608 |
f"Revoke their system access and notify IT."
|
| 609 |
)
|
| 610 |
|
| 611 |
criteria = [
|
| 612 |
{"name": "created_request", "description": "Created offboarding request", "check": "tool_used:offboarding_create_request"},
|
| 613 |
+
{"name": "correct_emp", "description": "Used correct employee ID", "check": f"param_value:offboarding_create_request.employee_id={emp['emp_id']}"},
|
| 614 |
+
{"name": "correct_reason", "description": "Set correct reason", "check": f"param_contains:offboarding_create_request.reason={reason}"},
|
| 615 |
{"name": "revoked_access", "description": "Revoked IT access", "check": "tool_used:it_revoke_access"},
|
| 616 |
{"name": "notified", "description": "Sent notification", "check": "tool_used_any:email_send,slack_send_message"},
|
| 617 |
]
|
|
|
|
| 633 |
tasks = []
|
| 634 |
|
| 635 |
# Full offboarding for managers/directors with reports
|
| 636 |
+
for _ in range(8):
|
| 637 |
# Find an employee who has direct reports
|
| 638 |
candidates = [e for e in self.world.state["employees"]
|
| 639 |
if e["status"] == "active" and int(e["level"][1]) >= 3]
|
|
|
|
| 673 |
))
|
| 674 |
|
| 675 |
# Offboarding with asset reclamation
|
| 676 |
+
for _ in range(8):
|
| 677 |
emp = _pick_employee(self.world, status="active")
|
| 678 |
if not emp:
|
| 679 |
continue
|
|
|
|
| 709 |
tasks = []
|
| 710 |
|
| 711 |
# 1. Headcount limit exceeded
|
| 712 |
+
for dept in ["Marketing", "Finance", "HR", "Sales", "Data Science"]:
|
| 713 |
tasks.append(Task(
|
| 714 |
task_id=self._next_id(),
|
| 715 |
instruction=f"Onboard a new L1 Associate to the {dept} department. Create their employee record and start onboarding.",
|
|
|
|
| 851 |
category="offboarding",
|
| 852 |
expected_tools=["offboarding_create_request", "it_revoke_access", "offboarding_complete_step"],
|
| 853 |
rubric_criteria=[
|
| 854 |
+
{"name": "created_request", "description": "Created offboarding with termination reason", "check": "param_contains:offboarding_create_request.reason=terminat"},
|
| 855 |
{"name": "revoked_access", "description": "Revoked all access", "check": "tool_used:it_revoke_access"},
|
| 856 |
{"name": "no_farewell", "description": "Did NOT send farewell communications", "check": "tool_not_used:slack_send_message"},
|
| 857 |
{"name": "completed_steps", "description": "Completed termination steps", "check": "tool_used:offboarding_complete_step"},
|
|
|
|
| 911 |
context={"edge_case": "policy_check"},
|
| 912 |
))
|
| 913 |
|
| 914 |
+
# 12. Software license full — Salesforce
|
| 915 |
+
tasks.append(Task(
|
| 916 |
+
task_id=self._next_id(),
|
| 917 |
+
instruction="Check if there are available Salesforce licenses for a new Sales hire.",
|
| 918 |
+
difficulty="edge_case",
|
| 919 |
+
category="onboarding",
|
| 920 |
+
expected_tools=["it_get_software_licenses"],
|
| 921 |
+
rubric_criteria=[
|
| 922 |
+
{"name": "checked_licenses", "description": "Checked licenses", "check": "tool_used:it_get_software_licenses"},
|
| 923 |
+
{"name": "correct_software", "description": "Checked Salesforce", "check": "param_contains:it_get_software_licenses.software_name=Salesforce"},
|
| 924 |
+
],
|
| 925 |
+
context={"edge_case": "license_check", "software": "Salesforce"},
|
| 926 |
+
))
|
| 927 |
+
|
| 928 |
+
# 13. Software license full — Figma
|
| 929 |
+
tasks.append(Task(
|
| 930 |
+
task_id=self._next_id(),
|
| 931 |
+
instruction="A new Product designer needs Figma access. Check if there are available Figma licenses.",
|
| 932 |
+
difficulty="edge_case",
|
| 933 |
+
category="onboarding",
|
| 934 |
+
expected_tools=["it_get_software_licenses"],
|
| 935 |
+
rubric_criteria=[
|
| 936 |
+
{"name": "checked_licenses", "description": "Checked licenses", "check": "tool_used:it_get_software_licenses"},
|
| 937 |
+
{"name": "correct_software", "description": "Checked Figma", "check": "param_contains:it_get_software_licenses.software_name=Figma"},
|
| 938 |
+
],
|
| 939 |
+
context={"edge_case": "license_check", "software": "Figma"},
|
| 940 |
+
))
|
| 941 |
+
|
| 942 |
+
# 14. Contractor onboarding — Marketing
|
| 943 |
+
tasks.append(Task(
|
| 944 |
+
task_id=self._next_id(),
|
| 945 |
+
instruction=(
|
| 946 |
+
"Onboard contractor Lucia Bianchi to Marketing as an L1 Contract Content Writer. "
|
| 947 |
+
"Contractors have limited access β no VPN, restricted to Slack and Google Workspace only, "
|
| 948 |
+
"and require legal approval. Create the record, initiate onboarding, "
|
| 949 |
+
"get legal approval, and provision appropriate (limited) access."
|
| 950 |
+
),
|
| 951 |
+
difficulty="edge_case",
|
| 952 |
+
category="onboarding",
|
| 953 |
+
expected_tools=["hr_create_employee", "onboarding_create_request", "approval_request",
|
| 954 |
+
"it_create_account"],
|
| 955 |
+
rubric_criteria=[
|
| 956 |
+
{"name": "created_contractor", "description": "Created employee with is_contractor=true", "check": "param_value:hr_create_employee.is_contractor=True"},
|
| 957 |
+
{"name": "initiated_onboarding", "description": "Created onboarding request", "check": "tool_used:onboarding_create_request"},
|
| 958 |
+
{"name": "legal_approval", "description": "Got legal approval", "check": "param_value:approval_request.approval_type=legal_approval"},
|
| 959 |
+
{"name": "limited_access", "description": "Created limited accounts", "check": "tool_used:it_create_account"},
|
| 960 |
+
],
|
| 961 |
+
context={"edge_case": "contractor_onboarding", "name": "Lucia Bianchi"},
|
| 962 |
+
))
|
| 963 |
+
|
| 964 |
+
# 15. Second termination scenario — security breach
|
| 965 |
+
emp2 = _pick_employee(self.world, status="active", has_manager=True)
|
| 966 |
+
if emp2:
|
| 967 |
+
tasks.append(Task(
|
| 968 |
+
task_id=self._next_id(),
|
| 969 |
+
instruction=(
|
| 970 |
+
f"{emp2['name']} ({emp2['emp_id']}) is being terminated due to a security breach. "
|
| 971 |
+
f"Immediately revoke all system access and badges, create the termination request, "
|
| 972 |
+
f"and ensure all offboarding steps are completed. Do NOT send farewell messages."
|
| 973 |
+
),
|
| 974 |
+
difficulty="edge_case",
|
| 975 |
+
category="offboarding",
|
| 976 |
+
expected_tools=["offboarding_create_request", "it_revoke_access", "offboarding_complete_step"],
|
| 977 |
+
rubric_criteria=[
|
| 978 |
+
{"name": "created_request", "description": "Created offboarding with termination reason", "check": "param_contains:offboarding_create_request.reason=terminat"},
|
| 979 |
+
{"name": "revoked_access", "description": "Revoked all access", "check": "tool_used:it_revoke_access"},
|
| 980 |
+
{"name": "no_farewell_email", "description": "Did NOT send farewell email", "check": "tool_not_used:email_send"},
|
| 981 |
+
{"name": "no_farewell_slack", "description": "Did NOT send farewell Slack", "check": "tool_not_used:slack_send_message"},
|
| 982 |
+
{"name": "completed_steps", "description": "Completed termination steps", "check": "tool_used:offboarding_complete_step"},
|
| 983 |
+
],
|
| 984 |
+
context={"target_emp_id": emp2["emp_id"], "edge_case": "termination_security_breach"},
|
| 985 |
+
))
|
| 986 |
+
|
| 987 |
+
# 16. Third termination scenario — misconduct
|
| 988 |
+
emp3 = _pick_employee(self.world, status="active", has_manager=True)
|
| 989 |
+
if emp3:
|
| 990 |
+
tasks.append(Task(
|
| 991 |
+
task_id=self._next_id(),
|
| 992 |
+
instruction=(
|
| 993 |
+
f"{emp3['name']} ({emp3['emp_id']}) is being terminated for workplace misconduct. "
|
| 994 |
+
f"Follow the termination policy: revoke all access immediately, "
|
| 995 |
+
f"create the termination offboarding request with reason 'termination', "
|
| 996 |
+
f"and complete the process. No farewell communications."
|
| 997 |
+
),
|
| 998 |
+
difficulty="edge_case",
|
| 999 |
+
category="offboarding",
|
| 1000 |
+
expected_tools=["offboarding_create_request", "it_revoke_access"],
|
| 1001 |
+
rubric_criteria=[
|
| 1002 |
+
{"name": "revoked_first", "description": "Revoked access", "check": "tool_used:it_revoke_access"},
|
| 1003 |
+
{"name": "created_request", "description": "Created termination request", "check": "param_contains:offboarding_create_request.reason=terminat"},
|
| 1004 |
+
{"name": "no_farewell", "description": "No farewell sent", "check": "tool_not_used:slack_send_message"},
|
| 1005 |
+
],
|
| 1006 |
+
context={"target_emp_id": emp3["emp_id"], "edge_case": "termination_misconduct"},
|
| 1007 |
+
))
|
| 1008 |
+
|
| 1009 |
+
# 17. Bulk onboarding resource check
|
| 1010 |
+
tasks.append(Task(
|
| 1011 |
+
task_id=self._next_id(),
|
| 1012 |
+
instruction=(
|
| 1013 |
+
"The Engineering team is hiring 5 new engineers at once. Before proceeding, "
|
| 1014 |
+
"check available laptops, monitors, and software licenses (Jira, GitHub, AWS). "
|
| 1015 |
+
"Report what resources are available."
|
| 1016 |
+
),
|
| 1017 |
+
difficulty="edge_case",
|
| 1018 |
+
category="onboarding",
|
| 1019 |
+
expected_tools=["it_get_available_assets", "it_get_software_licenses"],
|
| 1020 |
+
rubric_criteria=[
|
| 1021 |
+
{"name": "checked_laptops", "description": "Checked laptop availability", "check": "tool_used:it_get_available_assets"},
|
| 1022 |
+
{"name": "checked_licenses", "description": "Checked software licenses", "check": "tool_used:it_get_software_licenses"},
|
| 1023 |
+
{"name": "multiple_checks", "description": "Made multiple resource checks", "check": "tool_count:it_get_software_licenses>=2"},
|
| 1024 |
+
],
|
| 1025 |
+
context={"edge_case": "bulk_onboarding_resources"},
|
| 1026 |
+
))
|
| 1027 |
+
|
| 1028 |
+
# 18. Look up termination policy
|
| 1029 |
+
tasks.append(Task(
|
| 1030 |
+
task_id=self._next_id(),
|
| 1031 |
+
instruction="Look up the company's termination policy and the offboarding policy to understand the required steps.",
|
| 1032 |
+
difficulty="edge_case",
|
| 1033 |
+
category="lookup",
|
| 1034 |
+
expected_tools=["policy_lookup"],
|
| 1035 |
+
rubric_criteria=[
|
| 1036 |
+
{"name": "looked_up_policy", "description": "Looked up policy", "check": "tool_used:policy_lookup"},
|
| 1037 |
+
{"name": "multiple_lookups", "description": "Looked up multiple policies", "check": "tool_count:policy_lookup>=2"},
|
| 1038 |
+
],
|
| 1039 |
+
context={"edge_case": "policy_check_termination"},
|
| 1040 |
+
))
|
| 1041 |
+
|
| 1042 |
return tasks
|
| 1043 |
|
| 1044 |
# ---- Cross-Workflow Tasks (10) ----
|
|
|
|
| 1050 |
("Engineering", "Product"),
|
| 1051 |
("Sales", "Marketing"),
|
| 1052 |
("Data Science", "Engineering"),
|
| 1053 |
+
("Finance", "HR"),
|
| 1054 |
+
("Marketing", "Product"),
|
| 1055 |
+
("Security", "Engineering"),
|
| 1056 |
]
|
| 1057 |
for from_dept, to_dept in transfers:
|
| 1058 |
emp = _pick_employee(self.world, status="active", department=from_dept)
|
|
|
|
| 1082 |
context={"target_emp_id": emp["emp_id"], "from_dept": from_dept, "to_dept": to_dept},
|
| 1083 |
))
|
| 1084 |
|
| 1085 |
+
# 4-7. Rehire previously offboarded employee
|
| 1086 |
+
for _ in range(4):
|
| 1087 |
emp = _pick_employee(self.world, status="offboarded")
|
| 1088 |
if not emp:
|
| 1089 |
continue
|
|
|
|
| 1110 |
context={"target_emp_id": emp["emp_id"], "rehire": True},
|
| 1111 |
))
|
| 1112 |
|
| 1113 |
+
# Bulk operations
|
| 1114 |
+
for dept in self.rng.sample(["Engineering", "Product", "Data Science", "Marketing", "Sales", "Security"], 6):
|
| 1115 |
tasks.append(Task(
|
| 1116 |
task_id=self._next_id(),
|
| 1117 |
instruction=(
|
|
|
|
| 1129 |
context={"department": dept},
|
| 1130 |
))
|
| 1131 |
|
| 1132 |
+
# Manager leaving — handle succession
|
| 1133 |
+
for _ in range(4):
|
| 1134 |
candidates = [e for e in self.world.state["employees"]
|
| 1135 |
if e["status"] == "active" and int(e["level"][1]) >= 3
|
| 1136 |
and e.get("manager_id")]
|
test_all_tasks.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Run all 77 tasks with GPT-4o-mini and compute aggregate metrics.

For every task index the script builds a fresh ``HROnboardingEnvironment``,
lets the chat model drive it one JSON tool call at a time, scores the
resulting action log with ``RubricEvaluator``, prints a per-task line and
aggregate tables, and writes everything to ``outputs/full_eval_results.json``.
"""

import sys
import json
import os
import re
import time

from dotenv import load_dotenv
load_dotenv()

# Make both the project root and ./server importable when run from the repo root.
sys.path.insert(0, ".")
sys.path.insert(0, "./server")

from openai import OpenAI
from server.hr_onboarding_environment import HROnboardingEnvironment
from models import HROnboardingAction
from server.tools import TOOL_DEFINITIONS
from server.rubrics import RubricEvaluator

# Single source of truth for values that were previously repeated as literals.
MODEL = "gpt-4o-mini"
num_tasks = 77

# Preferred display order for the breakdown tables; values not listed here are
# appended afterwards instead of being silently dropped.
DIFFICULTY_ORDER = ["simple", "medium", "complex", "edge_case"]
CATEGORY_ORDER = ["lookup", "onboarding", "offboarding", "cross_workflow"]


def _parse_tool_call(text):
    """Return the tool-call dict embedded in *text*, or ``None`` if unusable.

    The outermost ``{...}`` span is tried first so that surrounding chatter is
    tolerated; if there is none, the whole text is parsed. Anything that is
    not a JSON object, or whose ``params`` is not an object, is rejected so the
    caller can ask the model to retry rather than crash.
    """
    match = re.search(r'\{.*\}', text, re.DOTALL)
    candidate = match.group() if match else text
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None
    if not isinstance(parsed.get("params", {}), dict):
        return None
    return parsed


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _group_values(rows, field, preferred):
    """List the distinct ``row[field]`` values: preferred order first, then the rest sorted."""
    seen = {row[field] for row in rows}
    return [v for v in preferred if v in seen] + sorted(seen - set(preferred))


client = OpenAI()
tool_desc = json.dumps(TOOL_DEFINITIONS, indent=2)

system_prompt = (
    "You are an HR automation agent for AcmeCorp. You help with employee "
    "onboarding and offboarding by calling the appropriate tools.\n\n"
    "For each step, respond with ONLY a JSON tool call in this exact format:\n"
    '{"tool": "<tool_name>", "params": {<parameters>}}\n\n'
    'When you believe the task is complete, respond with:\n'
    '{"tool": "__done__", "params": {}}\n\n'
    "Important rules:\n"
    "- Respond with ONLY the JSON object, no other text\n"
    "- Use the exact tool names and parameter names from the tool definitions\n"
    "- Think about what information you need and what tools to call in what order\n\n"
    f"Available tools:\n{tool_desc}"
)

results = []
evaluator = RubricEvaluator()

print("=" * 70)
print(f"HR ONBOARDING ENVIRONMENT — FULL EVALUATION ({num_tasks} tasks)")
print(f"Model: {MODEL}")
print("=" * 70)

for task_idx in range(num_tasks):
    env = HROnboardingEnvironment(seed=42, max_steps=15)
    # Cycle to the desired task: reset() is called task_idx + 1 times on a
    # freshly seeded environment.
    for _ in range(task_idx + 1):
        obs = env.reset()

    # The evaluator needs the task object itself, which is only reachable
    # through the environment's private attribute.
    task = env._current_task
    task_id = obs.task_id
    difficulty = obs.metadata.get("difficulty", "?")
    category = obs.metadata.get("category", "?")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": obs.instruction},
    ]

    steps_taken = 0
    error_count = 0

    # Every iteration (including API failures and parse failures) consumes one
    # unit of the step budget, so the loop always terminates.
    for step in range(1, obs.max_steps + 1):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                temperature=0.1,
                max_tokens=512,
            )
            assistant_msg = response.choices[0].message.content.strip()
        except Exception as e:
            # Top-level boundary: report, back off briefly, and try the next step.
            print(f"  API error on {task_id} step {step}: {e}")
            time.sleep(5)
            continue

        # Parse tool call; on any unusable reply ask the model to try again.
        tool_call = _parse_tool_call(assistant_msg)
        if tool_call is None:
            messages.append({"role": "assistant", "content": assistant_msg})
            messages.append({"role": "user", "content": 'Respond with valid JSON: {"tool": "<name>", "params": {<args>}}'})
            error_count += 1
            continue

        tool_name = tool_call.get("tool", "")
        params = tool_call.get("params", {})

        if tool_name == "__done__":
            break

        action = HROnboardingAction(tool_name=tool_name, arguments=params)
        obs = env.step(action)
        steps_taken += 1

        result_str = json.dumps(obs.tool_result, indent=2)
        messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": f"Tool result:\n{result_str}\n\nContinue with next tool call, or {{\"tool\": \"__done__\", \"params\": {{}}}} if done."})

        if obs.done:
            break

    # Evaluate the recorded actions against the task's rubric.
    eval_result = evaluator.evaluate(task, env.world.action_log)

    result = {
        "task_id": task_id,
        "difficulty": difficulty,
        "category": category,
        "score": eval_result["score"],
        "passed": eval_result["passed"],
        "passed_count": eval_result["passed_count"],
        "total_criteria": eval_result["total_criteria"],
        "steps_taken": steps_taken,
        "parse_errors": error_count,
    }
    results.append(result)

    status = "PASS" if result["passed"] else "FAIL"
    print(f"  [{task_idx+1:2d}/{num_tasks}] {task_id:10s} [{difficulty:10s}] [{category:14s}] "
          f"Score: {result['score']:.0%} ({result['passed_count']}/{result['total_criteria']}) "
          f"Steps: {steps_taken:2d}  {status}")

# --- Aggregate metrics ---
print("\n" + "=" * 70)
print("AGGREGATE RESULTS")
print("=" * 70)

total = len(results)
pass_count = sum(1 for r in results if r["passed"])
mean_score = _ratio(sum(r["score"] for r in results), total)
mean_steps = _ratio(sum(r["steps_taken"] for r in results), total)
total_criteria = sum(r["total_criteria"] for r in results)
total_passed_criteria = sum(r["passed_count"] for r in results)
pass_rate = _ratio(pass_count, total)
criteria_hit_rate = _ratio(total_passed_criteria, total_criteria)

print("\nOverall:")
print(f"  Tasks:         {total}")
print(f"  Pass rate:     {pass_count}/{total} ({pass_rate:.1%})")
print(f"  Mean score:    {mean_score:.3f}")
print(f"  Mean steps:    {mean_steps:.1f}")
print(f"  Criteria hit:  {total_passed_criteria}/{total_criteria} ({criteria_hit_rate:.1%})")

# By difficulty
print("\nBy Difficulty:")
for diff in _group_values(results, "difficulty", DIFFICULTY_ORDER):
    subset = [r for r in results if r["difficulty"] == diff]
    n = len(subset)
    p = sum(1 for r in subset if r["passed"])
    s = sum(r["score"] for r in subset) / n
    st = sum(r["steps_taken"] for r in subset) / n
    print(f"  {diff:10s}: {p:2d}/{n:2d} pass ({p/n:.0%})  mean_score={s:.2f}  mean_steps={st:.1f}")

# By category
print("\nBy Category:")
for cat in _group_values(results, "category", CATEGORY_ORDER):
    subset = [r for r in results if r["category"] == cat]
    n = len(subset)
    p = sum(1 for r in subset if r["passed"])
    s = sum(r["score"] for r in subset) / n
    print(f"  {cat:14s}: {p:2d}/{n:2d} pass ({p/n:.0%})  mean_score={s:.2f}")

# Save results
os.makedirs("outputs", exist_ok=True)
with open("outputs/full_eval_results.json", "w", encoding="utf-8") as f:
    json.dump({
        "model": MODEL,
        "total_tasks": total,
        "pass_count": pass_count,
        "pass_rate": pass_rate,
        "mean_score": mean_score,
        "mean_steps": mean_steps,
        "criteria_hit_rate": criteria_hit_rate,
        "results": results,
    }, f, indent=2)
print("\nDetailed results saved to outputs/full_eval_results.json")
|
train_hr_agent.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|