Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- evaluator.py +140 -0
- pyproject.toml +1 -0
- server/environment.py +65 -22
- server/requirements.txt +1 -0
- tasks.py +45 -1
evaluator.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lightweight rubric-based LLM judge for the HR environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
PASS_THRESHOLD = 0.6
|
| 14 |
+
|
| 15 |
+
SYSTEM_PROMPT = """\
|
| 16 |
+
You are an impartial evaluator assessing whether an AI agent successfully \
|
| 17 |
+
completed an HR task. Score accurately based on evidence from the action trace.
|
| 18 |
+
|
| 19 |
+
Scoring:
|
| 20 |
+
- 0.8-1.0: All requirements fully met with clear evidence.
|
| 21 |
+
- 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
|
| 22 |
+
- 0.4-0.6: Partial completion, significant gaps remain.
|
| 23 |
+
- 0.2-0.4: Minimal progress, most requirements failed.
|
| 24 |
+
- 0.0-0.2: No meaningful progress.
|
| 25 |
+
|
| 26 |
+
Respond with valid JSON (no markdown fences):
|
| 27 |
+
{"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class EvalResult:
    """Result from the rubric judge."""

    # Judge score clamped to [0.0, 1.0]; 0.0 when evaluation was skipped or errored.
    score: float
    # "PASS"/"FAIL" from the judge, or "SKIPPED"/"ERROR" set locally by evaluate_episode.
    verdict: str
    # Supporting observations the judge cites from the action trace.
    evidence: list[str]
    # Rubric items the judge considered unmet.
    failed_criteria: list[str]
    # Populated only when the judge could not run or its response was unparseable.
    error: str | None = None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def evaluate_episode(
    *,
    task_instruction: str,
    rubric: list[str],
    action_history: list[dict[str, Any]],
) -> EvalResult:
    """Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score."""
    model = os.environ.get("VERIFIER_MODEL", "").strip()
    api_key = os.environ.get("VERIFIER_API_KEY", "").strip()

    # Without judge credentials we cannot score; report SKIPPED rather than fail.
    if not (model and api_key):
        return EvalResult(
            score=0.0,
            verdict="SKIPPED",
            evidence=[],
            failed_criteria=[],
            error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
        )

    provider = os.environ.get("VERIFIER_PROVIDER", "").strip() or None
    base_url = os.environ.get("VERIFIER_BASE_URL", "").strip() or None

    rubric_text = "No specific rubric provided."
    if rubric:
        rubric_text = "\n".join(f"- {r}" for r in rubric)

    # Keep only the most recent 50 actions and cap the serialized size so the
    # prompt stays within typical judge-model context limits.
    trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
    if len(trace) > 40000:
        trace = trace[:40000] + "\n... [truncated]"

    user_prompt = "\n".join(
        [
            "# Task",
            task_instruction,
            "",
            "# Rubric Criteria",
            rubric_text,
            "",
            "# Agent Action Trace",
            trace,
        ]
    )

    try:
        import litellm

        # Qualify the model name with the provider prefix litellm expects,
        # unless the caller already supplied it.
        litellm_model = model
        if provider and not model.startswith(f"{provider}/"):
            litellm_model = f"{provider}/{model}"

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
        response = litellm.completion(
            model=litellm_model,
            messages=messages,
            api_key=api_key,
            base_url=base_url,
            temperature=0.2,
        )
        raw = response.choices[0].message.content or ""
    except Exception as e:
        logger.warning("Rubric judge LLM call failed: %s", e)
        return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e))

    return _parse_response(raw)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _parse_response(raw: str) -> EvalResult:
    """Parse the judge's JSON response into an EvalResult.

    Tolerates markdown code fences and surrounding prose; any response that
    cannot be reduced to a JSON object yields an ERROR result instead of
    raising, so a malformed judge reply never crashes the episode.
    """
    text = raw.strip()
    # Strip markdown fences the judge may emit despite instructions.
    if text.startswith("```"):
        text = text.strip("`\n")
        if text.lower().startswith("json"):
            text = text[4:].strip()

    data: Any = None
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        import re

        # Fall back to the first {...} span in case the judge wrapped the JSON in prose.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group(0))
            except json.JSONDecodeError:
                data = None

    # A usable response must be a JSON object; a bare list/scalar is rejected too
    # (the original code would have raised AttributeError on .get below).
    if not isinstance(data, dict):
        return EvalResult(
            score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
            error=f"Could not parse judge response: {raw[:300]}",
        )

    # Coerce the score defensively: the judge may emit a non-numeric value,
    # which would otherwise raise TypeError/ValueError from float().
    try:
        score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
    except (TypeError, ValueError):
        score = 0.0
    verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
    evidence = data.get("evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    failed = data.get("failed_criteria", [])
    if isinstance(failed, str):
        failed = [failed]

    return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)
|
pyproject.toml
CHANGED
|
@@ -10,6 +10,7 @@ dependencies = [
|
|
| 10 |
"openenv-core>=0.2.3",
|
| 11 |
"pydantic>=2.0",
|
| 12 |
"requests>=2.28",
|
|
|
|
| 13 |
]
|
| 14 |
|
| 15 |
[project.urls]
|
|
|
|
| 10 |
"openenv-core>=0.2.3",
|
| 11 |
"pydantic>=2.0",
|
| 12 |
"requests>=2.28",
|
| 13 |
+
"litellm>=1.80.0",
|
| 14 |
]
|
| 15 |
|
| 16 |
[project.urls]
|
server/environment.py
CHANGED
|
@@ -5,12 +5,14 @@ from __future__ import annotations
|
|
| 5 |
import json
|
| 6 |
import logging
|
| 7 |
import os
|
|
|
|
| 8 |
from uuid import uuid4
|
| 9 |
|
| 10 |
import requests
|
| 11 |
from openenv.core.env_server.interfaces import Environment
|
| 12 |
from openenv.core.env_server.types import State
|
| 13 |
|
|
|
|
| 14 |
from simlab_hr.models import HRAction, HRObservation
|
| 15 |
from simlab_hr.tasks import BUNDLED_TASKS, get_task
|
| 16 |
|
|
@@ -45,19 +47,22 @@ class HREnvironment(Environment):
|
|
| 45 |
self._current_task = BUNDLED_TASKS[0]
|
| 46 |
self._tools: dict[str, list[str]] = {}
|
| 47 |
self._episode_count = 0
|
|
|
|
| 48 |
|
| 49 |
def reset(self) -> HRObservation:
|
| 50 |
self._current_task = get_task(self._episode_count)
|
| 51 |
self._episode_count += 1
|
| 52 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 53 |
self._tools = self._discover_all_tools()
|
|
|
|
| 54 |
|
| 55 |
return HRObservation(
|
| 56 |
result=(
|
| 57 |
"HR environment ready. You have access to 4 tool servers: "
|
| 58 |
"hrms (employee records, leave, payroll), email (inbox), "
|
| 59 |
"calendar (scheduling), and rocketchat (team messaging). "
|
| 60 |
-
"
|
|
|
|
| 61 |
),
|
| 62 |
is_error=False,
|
| 63 |
tools_available=self._tools,
|
|
@@ -69,17 +74,44 @@ class HREnvironment(Environment):
|
|
| 69 |
def step(self, action: HRAction) -> HRObservation:
|
| 70 |
self._state.step_count += 1
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
server_url = self._server_urls.get(action.tool_server)
|
| 73 |
if server_url is None:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
|
|
|
|
|
|
| 83 |
payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
|
| 84 |
try:
|
| 85 |
resp = requests.post(
|
|
@@ -90,37 +122,48 @@ class HREnvironment(Environment):
|
|
| 90 |
)
|
| 91 |
result = resp.text
|
| 92 |
is_error = resp.status_code != 200
|
| 93 |
-
|
| 94 |
try:
|
| 95 |
parsed = resp.json()
|
| 96 |
result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
|
| 97 |
except (json.JSONDecodeError, ValueError):
|
| 98 |
pass
|
| 99 |
-
|
| 100 |
except requests.RequestException as exc:
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
return HRObservation(
|
| 107 |
-
result=
|
| 108 |
-
is_error=
|
| 109 |
tools_available=self._tools,
|
| 110 |
task_instruction=self._current_task.instruction,
|
| 111 |
-
done=
|
| 112 |
-
reward=
|
| 113 |
)
|
| 114 |
|
| 115 |
-
@property
|
| 116 |
-
def state(self) -> State:
|
| 117 |
-
return self._state
|
| 118 |
-
|
| 119 |
def _discover_all_tools(self) -> dict[str, list[str]]:
|
| 120 |
"""Fetch available tools from each tool server."""
|
| 121 |
all_tools: dict[str, list[str]] = {}
|
| 122 |
for name, url in self._server_urls.items():
|
| 123 |
all_tools[name] = self._discover_tools(name, url)
|
|
|
|
| 124 |
return all_tools
|
| 125 |
|
| 126 |
def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
|
|
|
|
| 5 |
import json
|
| 6 |
import logging
|
| 7 |
import os
|
| 8 |
+
from typing import Any
|
| 9 |
from uuid import uuid4
|
| 10 |
|
| 11 |
import requests
|
| 12 |
from openenv.core.env_server.interfaces import Environment
|
| 13 |
from openenv.core.env_server.types import State
|
| 14 |
|
| 15 |
+
from simlab_hr.evaluator import evaluate_episode
|
| 16 |
from simlab_hr.models import HRAction, HRObservation
|
| 17 |
from simlab_hr.tasks import BUNDLED_TASKS, get_task
|
| 18 |
|
|
|
|
| 47 |
self._current_task = BUNDLED_TASKS[0]
|
| 48 |
self._tools: dict[str, list[str]] = {}
|
| 49 |
self._episode_count = 0
|
| 50 |
+
self._action_history: list[dict[str, Any]] = []
|
| 51 |
|
| 52 |
def reset(self) -> HRObservation:
|
| 53 |
self._current_task = get_task(self._episode_count)
|
| 54 |
self._episode_count += 1
|
| 55 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 56 |
self._tools = self._discover_all_tools()
|
| 57 |
+
self._action_history = []
|
| 58 |
|
| 59 |
return HRObservation(
|
| 60 |
result=(
|
| 61 |
"HR environment ready. You have access to 4 tool servers: "
|
| 62 |
"hrms (employee records, leave, payroll), email (inbox), "
|
| 63 |
"calendar (scheduling), and rocketchat (team messaging). "
|
| 64 |
+
"When you've completed the task, call tool_name='submit_task' "
|
| 65 |
+
"on any server to trigger evaluation and get your score."
|
| 66 |
),
|
| 67 |
is_error=False,
|
| 68 |
tools_available=self._tools,
|
|
|
|
| 74 |
def step(self, action: HRAction) -> HRObservation:
    """Execute one agent action: dispatch the tool call, record it in the
    transcript, and finish the episode on submit_task or at the step budget."""
    self._state.step_count += 1

    # The sentinel tool ends the episode and triggers the rubric judge.
    if action.tool_name == "submit_task":
        return self._evaluate_and_finish()

    target = self._server_urls.get(action.tool_server)
    if target is not None:
        result, is_error = self._call_tool(target, action)
    else:
        is_error = True
        result = f"Unknown tool server: '{action.tool_server}'. Use one of: hrms, email, calendar, rocketchat."

    # Keep a bounded transcript entry (result capped at 2000 chars) for the judge.
    record = {
        "step": self._state.step_count,
        "server": action.tool_server,
        "tool": action.tool_name,
        "parameters": action.parameters,
        "result": result[:2000],
        "is_error": is_error,
    }
    self._action_history.append(record)

    # Force evaluation once the per-episode step budget is exhausted.
    if self._state.step_count >= MAX_STEPS_PER_EPISODE:
        return self._evaluate_and_finish()

    return HRObservation(
        result=result,
        is_error=is_error,
        tools_available=self._tools,
        task_instruction=self._current_task.instruction,
        done=False,
        reward=0.0,
    )
|
| 108 |
+
|
| 109 |
+
@property
def state(self) -> State:
    """Current episode state (episode id and step count)."""
    return self._state
|
| 112 |
|
| 113 |
+
def _call_tool(self, server_url: str, action: HRAction) -> tuple[str, bool]:
|
| 114 |
+
"""Proxy a tool call to the appropriate server. Returns (result, is_error)."""
|
| 115 |
payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
|
| 116 |
try:
|
| 117 |
resp = requests.post(
|
|
|
|
| 122 |
)
|
| 123 |
result = resp.text
|
| 124 |
is_error = resp.status_code != 200
|
|
|
|
| 125 |
try:
|
| 126 |
parsed = resp.json()
|
| 127 |
result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
|
| 128 |
except (json.JSONDecodeError, ValueError):
|
| 129 |
pass
|
| 130 |
+
return result, is_error
|
| 131 |
except requests.RequestException as exc:
|
| 132 |
+
return f"Tool invocation failed on {action.tool_server}: {exc}", True
|
| 133 |
+
|
| 134 |
+
def _evaluate_and_finish(self) -> HRObservation:
    """Run the rubric judge and return the final observation with reward."""
    outcome = evaluate_episode(
        task_instruction=self._current_task.instruction,
        rubric=self._current_task.rubric,
        action_history=self._action_history,
    )

    # Assemble a human-readable summary of the judge's decision for the agent.
    lines = [f"Episode complete. Score: {outcome.score:.2f} ({outcome.verdict})"]
    if outcome.evidence:
        lines.append("Evidence: " + "; ".join(outcome.evidence))
    if outcome.failed_criteria:
        lines.append("Failed: " + "; ".join(outcome.failed_criteria))
    if outcome.error:
        lines.append(f"Note: {outcome.error}")
    verdict_msg = "\n".join(lines)

    return HRObservation(
        result=verdict_msg,
        is_error=False,
        tools_available=self._tools,
        task_instruction=self._current_task.instruction,
        done=True,
        reward=outcome.score,
    )
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
def _discover_all_tools(self) -> dict[str, list[str]]:
    """Fetch available tools from each tool server."""
    discovered = {
        name: self._discover_tools(name, url)
        for name, url in self._server_urls.items()
    }
    # Advertise the episode-ending pseudo-tool under a meta bucket.
    discovered.setdefault("_meta", []).append("submit_task")
    return discovered
|
| 168 |
|
| 169 |
def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
|
server/requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
openenv-core>=0.2.3
|
| 2 |
pydantic>=2.0
|
| 3 |
requests>=2.28
|
|
|
|
| 4 |
uvicorn>=0.30
|
|
|
|
| 1 |
openenv-core>=0.2.3
|
| 2 |
pydantic>=2.0
|
| 3 |
requests>=2.28
|
| 4 |
+
litellm>=1.80.0
|
| 5 |
uvicorn>=0.30
|
tasks.py
CHANGED
|
@@ -5,7 +5,7 @@ from __future__ import annotations
|
|
| 5 |
import logging
|
| 6 |
import os
|
| 7 |
import random
|
| 8 |
-
from dataclasses import dataclass
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
|
@@ -20,6 +20,7 @@ class Task:
|
|
| 20 |
id: str
|
| 21 |
instruction: str
|
| 22 |
difficulty: str
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
BUNDLED_TASKS: list[Task] = [
|
|
@@ -32,6 +33,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 32 |
"priya.mehta@gmail.com with the interview details."
|
| 33 |
),
|
| 34 |
difficulty="medium",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
),
|
| 36 |
Task(
|
| 37 |
id="hr-002",
|
|
@@ -41,6 +47,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 41 |
"if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
|
| 42 |
),
|
| 43 |
difficulty="easy",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
),
|
| 45 |
Task(
|
| 46 |
id="hr-003",
|
|
@@ -50,6 +61,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 50 |
"and send a summary email to hr-team@company.com with the findings."
|
| 51 |
),
|
| 52 |
difficulty="medium",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
),
|
| 54 |
Task(
|
| 55 |
id="hr-004",
|
|
@@ -61,6 +77,12 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 61 |
"invites via email to all participants including the candidate at alex.rivera@email.com."
|
| 62 |
),
|
| 63 |
difficulty="hard",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
),
|
| 65 |
Task(
|
| 66 |
id="hr-005",
|
|
@@ -70,6 +92,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 70 |
"email, and post an announcement in the #general channel on Rocket.Chat."
|
| 71 |
),
|
| 72 |
difficulty="easy",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
),
|
| 74 |
Task(
|
| 75 |
id="hr-006",
|
|
@@ -81,6 +108,12 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 81 |
"#engineering channel on Rocket.Chat."
|
| 82 |
),
|
| 83 |
difficulty="hard",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
),
|
| 85 |
Task(
|
| 86 |
id="hr-007",
|
|
@@ -90,6 +123,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 90 |
"asking them to review the pending requests."
|
| 91 |
),
|
| 92 |
difficulty="medium",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
),
|
| 94 |
Task(
|
| 95 |
id="hr-008",
|
|
@@ -100,6 +138,11 @@ BUNDLED_TASKS: list[Task] = [
|
|
| 100 |
"send each employee an email notification about their scheduled review time."
|
| 101 |
),
|
| 102 |
difficulty="hard",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
),
|
| 104 |
]
|
| 105 |
|
|
@@ -165,4 +208,5 @@ def _fetch_api_task(api_key: str, task_index: int | None) -> Task:
|
|
| 165 |
id=api_task.get("task_id", "api-unknown"),
|
| 166 |
instruction=api_task.get("description", ""),
|
| 167 |
difficulty=api_task.get("difficulty", "unknown"),
|
|
|
|
| 168 |
)
|
|
|
|
| 5 |
import logging
|
| 6 |
import os
|
| 7 |
import random
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
|
|
|
| 20 |
id: str
|
| 21 |
instruction: str
|
| 22 |
difficulty: str
|
| 23 |
+
rubric: list[str] = field(default_factory=list)
|
| 24 |
|
| 25 |
|
| 26 |
BUNDLED_TASKS: list[Task] = [
|
|
|
|
| 33 |
"priya.mehta@gmail.com with the interview details."
|
| 34 |
),
|
| 35 |
difficulty="medium",
|
| 36 |
+
rubric=[
|
| 37 |
+
"Employee record created in HRMS for Priya Mehta",
|
| 38 |
+
"Phone screening event scheduled on calendar for next Tuesday at 2 PM",
|
| 39 |
+
"Confirmation email sent to priya.mehta@gmail.com with interview details",
|
| 40 |
+
],
|
| 41 |
),
|
| 42 |
Task(
|
| 43 |
id="hr-002",
|
|
|
|
| 47 |
"if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
|
| 48 |
),
|
| 49 |
difficulty="easy",
|
| 50 |
+
rubric=[
|
| 51 |
+
"Leave balance checked for employee EMP-0042",
|
| 52 |
+
"Leave request approved or denied based on balance",
|
| 53 |
+
"Manager Sarah Chen notified via RocketChat",
|
| 54 |
+
],
|
| 55 |
),
|
| 56 |
Task(
|
| 57 |
id="hr-003",
|
|
|
|
| 61 |
"and send a summary email to hr-team@company.com with the findings."
|
| 62 |
),
|
| 63 |
difficulty="medium",
|
| 64 |
+
rubric=[
|
| 65 |
+
"Attendance records retrieved from HRMS",
|
| 66 |
+
"Employees with >2 absences identified",
|
| 67 |
+
"Summary email sent to hr-team@company.com",
|
| 68 |
+
],
|
| 69 |
),
|
| 70 |
Task(
|
| 71 |
id="hr-004",
|
|
|
|
| 77 |
"invites via email to all participants including the candidate at alex.rivera@email.com."
|
| 78 |
),
|
| 79 |
difficulty="hard",
|
| 80 |
+
rubric=[
|
| 81 |
+
"Availability checked for all three interviewers on the calendar",
|
| 82 |
+
"Common 1-hour slot identified",
|
| 83 |
+
"Meeting booked on the calendar",
|
| 84 |
+
"Email invites sent to all participants including alex.rivera@email.com",
|
| 85 |
+
],
|
| 86 |
),
|
| 87 |
Task(
|
| 88 |
id="hr-005",
|
|
|
|
| 92 |
"email, and post an announcement in the #general channel on Rocket.Chat."
|
| 93 |
),
|
| 94 |
difficulty="easy",
|
| 95 |
+
rubric=[
|
| 96 |
+
"Designation updated in HRMS to Senior Developer",
|
| 97 |
+
"Congratulatory email sent to Maria Santos",
|
| 98 |
+
"Announcement posted in #general on RocketChat",
|
| 99 |
+
],
|
| 100 |
),
|
| 101 |
Task(
|
| 102 |
id="hr-006",
|
|
|
|
| 108 |
"#engineering channel on Rocket.Chat."
|
| 109 |
),
|
| 110 |
difficulty="hard",
|
| 111 |
+
rubric=[
|
| 112 |
+
"Employee record created in HRMS with department Engineering",
|
| 113 |
+
"Welcome email sent to david.kim@company.com",
|
| 114 |
+
"Orientation meeting scheduled on calendar for start date",
|
| 115 |
+
"Added to #engineering channel on RocketChat",
|
| 116 |
+
],
|
| 117 |
),
|
| 118 |
Task(
|
| 119 |
id="hr-007",
|
|
|
|
| 123 |
"asking them to review the pending requests."
|
| 124 |
),
|
| 125 |
difficulty="medium",
|
| 126 |
+
rubric=[
|
| 127 |
+
"Pending leave requests retrieved from HRMS",
|
| 128 |
+
"Approving managers identified for each request",
|
| 129 |
+
"Reminder emails sent to respective managers",
|
| 130 |
+
],
|
| 131 |
),
|
| 132 |
Task(
|
| 133 |
id="hr-008",
|
|
|
|
| 138 |
"send each employee an email notification about their scheduled review time."
|
| 139 |
),
|
| 140 |
difficulty="hard",
|
| 141 |
+
rubric=[
|
| 142 |
+
"Engineering department employees retrieved from HRMS",
|
| 143 |
+
"Individual 45-minute review meetings scheduled on calendar",
|
| 144 |
+
"Email notifications sent to each employee with their review time",
|
| 145 |
+
],
|
| 146 |
),
|
| 147 |
]
|
| 148 |
|
|
|
|
| 208 |
id=api_task.get("task_id", "api-unknown"),
|
| 209 |
instruction=api_task.get("description", ""),
|
| 210 |
difficulty=api_task.get("difficulty", "unknown"),
|
| 211 |
+
rubric=[],
|
| 212 |
)
|