renanserrano committed on
Commit
384d994
·
verified ·
1 Parent(s): bd67f06

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. evaluator.py +140 -0
  2. pyproject.toml +1 -0
  3. server/environment.py +65 -22
  4. server/requirements.txt +1 -0
  5. tasks.py +45 -1
evaluator.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lightweight rubric-based LLM judge for the HR environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ PASS_THRESHOLD = 0.6
14
+
15
+ SYSTEM_PROMPT = """\
16
+ You are an impartial evaluator assessing whether an AI agent successfully \
17
+ completed an HR task. Score accurately based on evidence from the action trace.
18
+
19
+ Scoring:
20
+ - 0.8-1.0: All requirements fully met with clear evidence.
21
+ - 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
22
+ - 0.4-0.6: Partial completion, significant gaps remain.
23
+ - 0.2-0.4: Minimal progress, most requirements failed.
24
+ - 0.0-0.2: No meaningful progress.
25
+
26
+ Respond with valid JSON (no markdown fences):
27
+ {"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
28
+
29
+
30
@dataclass
class EvalResult:
    """Result from the rubric judge.

    Produced by ``evaluate_episode``; consumers compare ``score`` (0.0-1.0)
    against ``PASS_THRESHOLD`` and read ``error`` when evaluation could not
    run or could not be parsed.
    """

    score: float  # judge score in [0.0, 1.0]; 0.0 when skipped or errored
    verdict: str  # "PASS"/"FAIL" from the judge, or "SKIPPED"/"ERROR" locally
    evidence: list[str]  # judge-cited observations supporting the score
    failed_criteria: list[str]  # rubric items the judge marked as unmet
    error: str | None = None  # set when evaluation was skipped or failed
39
+
40
+
41
def evaluate_episode(
    *,
    task_instruction: str,
    rubric: list[str],
    action_history: list[dict[str, Any]],
) -> EvalResult:
    """Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score.

    Reads VERIFIER_MODEL / VERIFIER_API_KEY (required) and, optionally,
    VERIFIER_PROVIDER / VERIFIER_BASE_URL from the environment. When the
    required variables are missing, returns a SKIPPED result instead of
    calling any LLM.
    """

    def read_env(var: str) -> str:
        return os.environ.get(var, "").strip()

    model = read_env("VERIFIER_MODEL")
    api_key = read_env("VERIFIER_API_KEY")

    # Without judge credentials the environment still works; scoring is off.
    if not (model and api_key):
        return EvalResult(
            score=0.0,
            verdict="SKIPPED",
            evidence=[],
            failed_criteria=[],
            error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
        )

    provider = read_env("VERIFIER_PROVIDER") or None
    base_url = read_env("VERIFIER_BASE_URL") or None

    if rubric:
        rubric_text = "\n".join(f"- {r}" for r in rubric)
    else:
        rubric_text = "No specific rubric provided."

    # Keep the prompt bounded: last 50 actions, hard-capped at 40k chars.
    trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
    if len(trace) > 40000:
        trace = trace[:40000] + "\n... [truncated]"

    user_prompt = f"""# Task
{task_instruction}

# Rubric Criteria
{rubric_text}

# Agent Action Trace
{trace}"""

    try:
        import litellm

        # litellm expects "provider/model"; only prefix when not already there.
        qualified_model = (
            f"{provider}/{model}"
            if provider and not model.startswith(f"{provider}/")
            else model
        )

        response = litellm.completion(
            model=qualified_model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            api_key=api_key,
            base_url=base_url,
            temperature=0.2,
        )
        raw = response.choices[0].message.content or ""
    except Exception as e:
        # Any failure (import, network, provider) degrades to an ERROR result.
        logger.warning("Rubric judge LLM call failed: %s", e)
        return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e))

    return _parse_response(raw)
101
+
102
+
103
def _parse_response(raw: str) -> EvalResult:
    """Parse the judge's JSON response into an EvalResult.

    Tolerates markdown code fences and surrounding prose. Any payload that
    cannot be reduced to a JSON object with a numeric score yields an ERROR
    result instead of raising (the judge is an untrusted LLM).
    """

    def unparseable() -> EvalResult:
        return EvalResult(
            score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
            error=f"Could not parse judge response: {raw[:300]}",
        )

    text = raw.strip()
    # Strip markdown fences the model may emit despite instructions.
    if text.startswith("```"):
        text = text.strip("`\n")
        if text.lower().startswith("json"):
            text = text[4:].strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        import re

        # Fall back to the first {...} span in case of surrounding prose.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            return unparseable()
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError:
            return unparseable()

    # The judge may return a bare list/string, or a non-numeric "score";
    # treat both as parse failures rather than letting them raise.
    if not isinstance(data, dict):
        return unparseable()
    try:
        score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
    except (TypeError, ValueError):
        return unparseable()

    verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
    evidence = data.get("evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    failed = data.get("failed_criteria", [])
    if isinstance(failed, str):
        failed = [failed]

    return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "openenv-core>=0.2.3",
11
  "pydantic>=2.0",
12
  "requests>=2.28",
 
13
  ]
14
 
15
  [project.urls]
 
10
  "openenv-core>=0.2.3",
11
  "pydantic>=2.0",
12
  "requests>=2.28",
13
+ "litellm>=1.80.0",
14
  ]
15
 
16
  [project.urls]
server/environment.py CHANGED
@@ -5,12 +5,14 @@ from __future__ import annotations
5
  import json
6
  import logging
7
  import os
 
8
  from uuid import uuid4
9
 
10
  import requests
11
  from openenv.core.env_server.interfaces import Environment
12
  from openenv.core.env_server.types import State
13
 
 
14
  from simlab_hr.models import HRAction, HRObservation
15
  from simlab_hr.tasks import BUNDLED_TASKS, get_task
16
 
@@ -45,19 +47,22 @@ class HREnvironment(Environment):
45
  self._current_task = BUNDLED_TASKS[0]
46
  self._tools: dict[str, list[str]] = {}
47
  self._episode_count = 0
 
48
 
49
  def reset(self) -> HRObservation:
50
  self._current_task = get_task(self._episode_count)
51
  self._episode_count += 1
52
  self._state = State(episode_id=str(uuid4()), step_count=0)
53
  self._tools = self._discover_all_tools()
 
54
 
55
  return HRObservation(
56
  result=(
57
  "HR environment ready. You have access to 4 tool servers: "
58
  "hrms (employee records, leave, payroll), email (inbox), "
59
  "calendar (scheduling), and rocketchat (team messaging). "
60
- "Use the tools to complete the task."
 
61
  ),
62
  is_error=False,
63
  tools_available=self._tools,
@@ -69,17 +74,44 @@ class HREnvironment(Environment):
69
  def step(self, action: HRAction) -> HRObservation:
70
  self._state.step_count += 1
71
 
 
 
 
72
  server_url = self._server_urls.get(action.tool_server)
73
  if server_url is None:
74
- return HRObservation(
75
- result=f"Unknown tool server: '{action.tool_server}'. Use one of: hrms, email, calendar, rocketchat.",
76
- is_error=True,
77
- tools_available=self._tools,
78
- task_instruction=self._current_task.instruction,
79
- done=False,
80
- reward=0.0,
81
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
 
83
  payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
84
  try:
85
  resp = requests.post(
@@ -90,37 +122,48 @@ class HREnvironment(Environment):
90
  )
91
  result = resp.text
92
  is_error = resp.status_code != 200
93
-
94
  try:
95
  parsed = resp.json()
96
  result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
97
  except (json.JSONDecodeError, ValueError):
98
  pass
99
-
100
  except requests.RequestException as exc:
101
- result = f"Tool invocation failed on {action.tool_server}: {exc}"
102
- is_error = True
 
 
 
 
 
 
 
103
 
104
- done = self._state.step_count >= MAX_STEPS_PER_EPISODE
 
 
 
 
 
 
 
 
105
 
106
  return HRObservation(
107
- result=result,
108
- is_error=is_error,
109
  tools_available=self._tools,
110
  task_instruction=self._current_task.instruction,
111
- done=done,
112
- reward=0.0,
113
  )
114
 
115
- @property
116
- def state(self) -> State:
117
- return self._state
118
-
119
  def _discover_all_tools(self) -> dict[str, list[str]]:
120
  """Fetch available tools from each tool server."""
121
  all_tools: dict[str, list[str]] = {}
122
  for name, url in self._server_urls.items():
123
  all_tools[name] = self._discover_tools(name, url)
 
124
  return all_tools
125
 
126
  def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
 
5
  import json
6
  import logging
7
  import os
8
+ from typing import Any
9
  from uuid import uuid4
10
 
11
  import requests
12
  from openenv.core.env_server.interfaces import Environment
13
  from openenv.core.env_server.types import State
14
 
15
+ from simlab_hr.evaluator import evaluate_episode
16
  from simlab_hr.models import HRAction, HRObservation
17
  from simlab_hr.tasks import BUNDLED_TASKS, get_task
18
 
 
47
  self._current_task = BUNDLED_TASKS[0]
48
  self._tools: dict[str, list[str]] = {}
49
  self._episode_count = 0
50
+ self._action_history: list[dict[str, Any]] = []
51
 
52
  def reset(self) -> HRObservation:
53
  self._current_task = get_task(self._episode_count)
54
  self._episode_count += 1
55
  self._state = State(episode_id=str(uuid4()), step_count=0)
56
  self._tools = self._discover_all_tools()
57
+ self._action_history = []
58
 
59
  return HRObservation(
60
  result=(
61
  "HR environment ready. You have access to 4 tool servers: "
62
  "hrms (employee records, leave, payroll), email (inbox), "
63
  "calendar (scheduling), and rocketchat (team messaging). "
64
+ "When you've completed the task, call tool_name='submit_task' "
65
+ "on any server to trigger evaluation and get your score."
66
  ),
67
  is_error=False,
68
  tools_available=self._tools,
 
74
    def step(self, action: HRAction) -> HRObservation:
        """Execute one tool action and return the resulting observation.

        ``submit_task`` (on any server) ends the episode and triggers the
        rubric judge; hitting MAX_STEPS_PER_EPISODE does the same. All other
        actions are proxied to the named tool server and recorded in the
        action history that the judge later reads.
        """
        self._state.step_count += 1

        # Sentinel action: agent declares the task done -> score the episode.
        if action.tool_name == "submit_task":
            return self._evaluate_and_finish()

        server_url = self._server_urls.get(action.tool_server)
        if server_url is None:
            result = f"Unknown tool server: '{action.tool_server}'. Use one of: hrms, email, calendar, rocketchat."
            is_error = True
        else:
            result, is_error = self._call_tool(server_url, action)

        # Record the call (unknown-server mistakes included) for the judge.
        self._action_history.append({
            "step": self._state.step_count,
            "server": action.tool_server,
            "tool": action.tool_name,
            "parameters": action.parameters,
            "result": result[:2000],  # cap per-entry size kept in the trace
            "is_error": is_error,
        })

        at_step_limit = self._state.step_count >= MAX_STEPS_PER_EPISODE
        if at_step_limit:
            # Out of step budget: force evaluation even without submit_task.
            return self._evaluate_and_finish()

        return HRObservation(
            result=result,
            is_error=is_error,
            tools_available=self._tools,
            task_instruction=self._current_task.instruction,
            done=False,
            reward=0.0,
        )
108
+
109
    @property
    def state(self) -> State:
        """Current episode state (episode id and step counter)."""
        return self._state
112
 
113
+ def _call_tool(self, server_url: str, action: HRAction) -> tuple[str, bool]:
114
+ """Proxy a tool call to the appropriate server. Returns (result, is_error)."""
115
  payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
116
  try:
117
  resp = requests.post(
 
122
  )
123
  result = resp.text
124
  is_error = resp.status_code != 200
 
125
  try:
126
  parsed = resp.json()
127
  result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
128
  except (json.JSONDecodeError, ValueError):
129
  pass
130
+ return result, is_error
131
  except requests.RequestException as exc:
132
+ return f"Tool invocation failed on {action.tool_server}: {exc}", True
133
+
134
+ def _evaluate_and_finish(self) -> HRObservation:
135
+ """Run the rubric judge and return the final observation with reward."""
136
+ eval_result = evaluate_episode(
137
+ task_instruction=self._current_task.instruction,
138
+ rubric=self._current_task.rubric,
139
+ action_history=self._action_history,
140
+ )
141
 
142
+ verdict_msg = (
143
+ f"Episode complete. Score: {eval_result.score:.2f} ({eval_result.verdict})"
144
+ )
145
+ if eval_result.evidence:
146
+ verdict_msg += "\nEvidence: " + "; ".join(eval_result.evidence)
147
+ if eval_result.failed_criteria:
148
+ verdict_msg += "\nFailed: " + "; ".join(eval_result.failed_criteria)
149
+ if eval_result.error:
150
+ verdict_msg += f"\nNote: {eval_result.error}"
151
 
152
  return HRObservation(
153
+ result=verdict_msg,
154
+ is_error=False,
155
  tools_available=self._tools,
156
  task_instruction=self._current_task.instruction,
157
+ done=True,
158
+ reward=eval_result.score,
159
  )
160
 
 
 
 
 
161
  def _discover_all_tools(self) -> dict[str, list[str]]:
162
  """Fetch available tools from each tool server."""
163
  all_tools: dict[str, list[str]] = {}
164
  for name, url in self._server_urls.items():
165
  all_tools[name] = self._discover_tools(name, url)
166
+ all_tools.setdefault("_meta", []).append("submit_task")
167
  return all_tools
168
 
169
  def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
server/requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  openenv-core>=0.2.3
2
  pydantic>=2.0
3
  requests>=2.28
 
4
  uvicorn>=0.30
 
1
  openenv-core>=0.2.3
2
  pydantic>=2.0
3
  requests>=2.28
4
+ litellm>=1.80.0
5
  uvicorn>=0.30
tasks.py CHANGED
@@ -5,7 +5,7 @@ from __future__ import annotations
5
  import logging
6
  import os
7
  import random
8
- from dataclasses import dataclass
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -20,6 +20,7 @@ class Task:
20
  id: str
21
  instruction: str
22
  difficulty: str
 
23
 
24
 
25
  BUNDLED_TASKS: list[Task] = [
@@ -32,6 +33,11 @@ BUNDLED_TASKS: list[Task] = [
32
  "priya.mehta@gmail.com with the interview details."
33
  ),
34
  difficulty="medium",
 
 
 
 
 
35
  ),
36
  Task(
37
  id="hr-002",
@@ -41,6 +47,11 @@ BUNDLED_TASKS: list[Task] = [
41
  "if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
42
  ),
43
  difficulty="easy",
 
 
 
 
 
44
  ),
45
  Task(
46
  id="hr-003",
@@ -50,6 +61,11 @@ BUNDLED_TASKS: list[Task] = [
50
  "and send a summary email to hr-team@company.com with the findings."
51
  ),
52
  difficulty="medium",
 
 
 
 
 
53
  ),
54
  Task(
55
  id="hr-004",
@@ -61,6 +77,12 @@ BUNDLED_TASKS: list[Task] = [
61
  "invites via email to all participants including the candidate at alex.rivera@email.com."
62
  ),
63
  difficulty="hard",
 
 
 
 
 
 
64
  ),
65
  Task(
66
  id="hr-005",
@@ -70,6 +92,11 @@ BUNDLED_TASKS: list[Task] = [
70
  "email, and post an announcement in the #general channel on Rocket.Chat."
71
  ),
72
  difficulty="easy",
 
 
 
 
 
73
  ),
74
  Task(
75
  id="hr-006",
@@ -81,6 +108,12 @@ BUNDLED_TASKS: list[Task] = [
81
  "#engineering channel on Rocket.Chat."
82
  ),
83
  difficulty="hard",
 
 
 
 
 
 
84
  ),
85
  Task(
86
  id="hr-007",
@@ -90,6 +123,11 @@ BUNDLED_TASKS: list[Task] = [
90
  "asking them to review the pending requests."
91
  ),
92
  difficulty="medium",
 
 
 
 
 
93
  ),
94
  Task(
95
  id="hr-008",
@@ -100,6 +138,11 @@ BUNDLED_TASKS: list[Task] = [
100
  "send each employee an email notification about their scheduled review time."
101
  ),
102
  difficulty="hard",
 
 
 
 
 
103
  ),
104
  ]
105
 
@@ -165,4 +208,5 @@ def _fetch_api_task(api_key: str, task_index: int | None) -> Task:
165
  id=api_task.get("task_id", "api-unknown"),
166
  instruction=api_task.get("description", ""),
167
  difficulty=api_task.get("difficulty", "unknown"),
 
168
  )
 
5
  import logging
6
  import os
7
  import random
8
+ from dataclasses import dataclass, field
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
20
  id: str
21
  instruction: str
22
  difficulty: str
23
+ rubric: list[str] = field(default_factory=list)
24
 
25
 
26
  BUNDLED_TASKS: list[Task] = [
 
33
  "priya.mehta@gmail.com with the interview details."
34
  ),
35
  difficulty="medium",
36
+ rubric=[
37
+ "Employee record created in HRMS for Priya Mehta",
38
+ "Phone screening event scheduled on calendar for next Tuesday at 2 PM",
39
+ "Confirmation email sent to priya.mehta@gmail.com with interview details",
40
+ ],
41
  ),
42
  Task(
43
  id="hr-002",
 
47
  "if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
48
  ),
49
  difficulty="easy",
50
+ rubric=[
51
+ "Leave balance checked for employee EMP-0042",
52
+ "Leave request approved or denied based on balance",
53
+ "Manager Sarah Chen notified via RocketChat",
54
+ ],
55
  ),
56
  Task(
57
  id="hr-003",
 
61
  "and send a summary email to hr-team@company.com with the findings."
62
  ),
63
  difficulty="medium",
64
+ rubric=[
65
+ "Attendance records retrieved from HRMS",
66
+ "Employees with >2 absences identified",
67
+ "Summary email sent to hr-team@company.com",
68
+ ],
69
  ),
70
  Task(
71
  id="hr-004",
 
77
  "invites via email to all participants including the candidate at alex.rivera@email.com."
78
  ),
79
  difficulty="hard",
80
+ rubric=[
81
+ "Availability checked for all three interviewers on the calendar",
82
+ "Common 1-hour slot identified",
83
+ "Meeting booked on the calendar",
84
+ "Email invites sent to all participants including alex.rivera@email.com",
85
+ ],
86
  ),
87
  Task(
88
  id="hr-005",
 
92
  "email, and post an announcement in the #general channel on Rocket.Chat."
93
  ),
94
  difficulty="easy",
95
+ rubric=[
96
+ "Designation updated in HRMS to Senior Developer",
97
+ "Congratulatory email sent to Maria Santos",
98
+ "Announcement posted in #general on RocketChat",
99
+ ],
100
  ),
101
  Task(
102
  id="hr-006",
 
108
  "#engineering channel on Rocket.Chat."
109
  ),
110
  difficulty="hard",
111
+ rubric=[
112
+ "Employee record created in HRMS with department Engineering",
113
+ "Welcome email sent to david.kim@company.com",
114
+ "Orientation meeting scheduled on calendar for start date",
115
+ "Added to #engineering channel on RocketChat",
116
+ ],
117
  ),
118
  Task(
119
  id="hr-007",
 
123
  "asking them to review the pending requests."
124
  ),
125
  difficulty="medium",
126
+ rubric=[
127
+ "Pending leave requests retrieved from HRMS",
128
+ "Approving managers identified for each request",
129
+ "Reminder emails sent to respective managers",
130
+ ],
131
  ),
132
  Task(
133
  id="hr-008",
 
138
  "send each employee an email notification about their scheduled review time."
139
  ),
140
  difficulty="hard",
141
+ rubric=[
142
+ "Engineering department employees retrieved from HRMS",
143
+ "Individual 45-minute review meetings scheduled on calendar",
144
+ "Email notifications sent to each employee with their review time",
145
+ ],
146
  ),
147
  ]
148
 
 
208
  id=api_task.get("task_id", "api-unknown"),
209
  instruction=api_task.get("description", ""),
210
  difficulty=api_task.get("difficulty", "unknown"),
211
+ rubric=[],
212
  )