renanserrano committed on
Commit
384d994
·
verified ·
1 Parent(s): bd67f06

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. evaluator.py +140 -0
  2. pyproject.toml +1 -0
  3. server/environment.py +65 -22
  4. server/requirements.txt +1 -0
  5. tasks.py +45 -1
evaluator.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lightweight rubric-based LLM judge for the HR environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ PASS_THRESHOLD = 0.6
14
+
15
+ SYSTEM_PROMPT = """\
16
+ You are an impartial evaluator assessing whether an AI agent successfully \
17
+ completed an HR task. Score accurately based on evidence from the action trace.
18
+
19
+ Scoring:
20
+ - 0.8-1.0: All requirements fully met with clear evidence.
21
+ - 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
22
+ - 0.4-0.6: Partial completion, significant gaps remain.
23
+ - 0.2-0.4: Minimal progress, most requirements failed.
24
+ - 0.0-0.2: No meaningful progress.
25
+
26
+ Respond with valid JSON (no markdown fences):
27
+ {"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
28
+
29
+
30
@dataclass
class EvalResult:
    """Result from the rubric judge.

    Produced by ``evaluate_episode``; consumers compare ``score`` (0.0-1.0)
    against ``PASS_THRESHOLD`` and read ``error`` when evaluation could not
    run or could not be parsed.
    """

    score: float  # judge score in [0.0, 1.0]; 0.0 when skipped or errored
    verdict: str  # "PASS"/"FAIL" from the judge, or "SKIPPED"/"ERROR" locally
    evidence: list[str]  # judge-cited observations supporting the score
    failed_criteria: list[str]  # rubric items the judge marked as unmet
    error: str | None = None  # set when evaluation was skipped or failed
39
+
40
+
41
def evaluate_episode(
    *,
    task_instruction: str,
    rubric: list[str],
    action_history: list[dict[str, Any]],
) -> EvalResult:
    """Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score.

    Reads VERIFIER_MODEL / VERIFIER_API_KEY (required) and, optionally,
    VERIFIER_PROVIDER / VERIFIER_BASE_URL from the environment. When the
    required variables are missing, returns a SKIPPED result instead of
    calling any LLM.
    """

    def read_env(var: str) -> str:
        return os.environ.get(var, "").strip()

    model = read_env("VERIFIER_MODEL")
    api_key = read_env("VERIFIER_API_KEY")

    # Without judge credentials the environment still works; scoring is off.
    if not (model and api_key):
        return EvalResult(
            score=0.0,
            verdict="SKIPPED",
            evidence=[],
            failed_criteria=[],
            error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
        )

    provider = read_env("VERIFIER_PROVIDER") or None
    base_url = read_env("VERIFIER_BASE_URL") or None

    if rubric:
        rubric_text = "\n".join(f"- {r}" for r in rubric)
    else:
        rubric_text = "No specific rubric provided."

    # Keep the prompt bounded: last 50 actions, hard-capped at 40k chars.
    trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
    if len(trace) > 40000:
        trace = trace[:40000] + "\n... [truncated]"

    user_prompt = f"""# Task
{task_instruction}

# Rubric Criteria
{rubric_text}

# Agent Action Trace
{trace}"""

    try:
        import litellm

        # litellm expects "provider/model"; only prefix when not already there.
        qualified_model = (
            f"{provider}/{model}"
            if provider and not model.startswith(f"{provider}/")
            else model
        )

        response = litellm.completion(
            model=qualified_model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            api_key=api_key,
            base_url=base_url,
            temperature=0.2,
        )
        raw = response.choices[0].message.content or ""
    except Exception as e:
        # Any failure (import, network, provider) degrades to an ERROR result.
        logger.warning("Rubric judge LLM call failed: %s", e)
        return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e))

    return _parse_response(raw)
101
+
102
+
103
def _parse_response(raw: str) -> EvalResult:
    """Parse the judge's JSON response into an EvalResult.

    Tolerates markdown code fences and surrounding prose. Any payload that
    cannot be reduced to a JSON object with a numeric score yields an ERROR
    result instead of raising (the judge is an untrusted LLM).
    """

    def unparseable() -> EvalResult:
        return EvalResult(
            score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
            error=f"Could not parse judge response: {raw[:300]}",
        )

    text = raw.strip()
    # Strip markdown fences the model may emit despite instructions.
    if text.startswith("```"):
        text = text.strip("`\n")
        if text.lower().startswith("json"):
            text = text[4:].strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        import re

        # Fall back to the first {...} span in case of surrounding prose.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            return unparseable()
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError:
            return unparseable()

    # The judge may return a bare list/string, or a non-numeric "score";
    # treat both as parse failures rather than letting them raise.
    if not isinstance(data, dict):
        return unparseable()
    try:
        score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
    except (TypeError, ValueError):
        return unparseable()

    verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
    evidence = data.get("evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    failed = data.get("failed_criteria", [])
    if isinstance(failed, str):
        failed = [failed]

    return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "openenv-core>=0.2.3",
11
  "pydantic>=2.0",
12
  "requests>=2.28",
 
13
  ]
14
 
15
  [project.urls]
 
10
  "openenv-core>=0.2.3",
11
  "pydantic>=2.0",
12
  "requests>=2.28",
13
+ "litellm>=1.80.0",
14
  ]
15
 
16
  [project.urls]
server/environment.py CHANGED
@@ -5,12 +5,14 @@ from __future__ import annotations
5
  import json
6
  import logging
7
  import os
 
8
  from uuid import uuid4
9
 
10
  import requests
11
  from openenv.core.env_server.interfaces import Environment
12
  from openenv.core.env_server.types import State
13
 
 
14
  from simlab_hr.models import HRAction, HRObservation
15
  from simlab_hr.tasks import BUNDLED_TASKS, get_task
16
 
@@ -45,19 +47,22 @@ class HREnvironment(Environment):
45
  self._current_task = BUNDLED_TASKS[0]
46
  self._tools: dict[str, list[str]] = {}
47
  self._episode_count = 0
 
48
 
49
  def reset(self) -> HRObservation:
50
  self._current_task = get_task(self._episode_count)
51
  self._episode_count += 1
52
  self._state = State(episode_id=str(uuid4()), step_count=0)
53
  self._tools = self._discover_all_tools()
 
54
 
55
  return HRObservation(
56
  result=(
57
  "HR environment ready. You have access to 4 tool servers: "
58
  "hrms (employee records, leave, payroll), email (inbox), "
59
  "calendar (scheduling), and rocketchat (team messaging). "
60
- "Use the tools to complete the task."
 
61
  ),
62
  is_error=False,
63
  tools_available=self._tools,
@@ -69,17 +74,44 @@ class HREnvironment(Environment):
69
  def step(self, action: HRAction) -> HRObservation:
70
  self._state.step_count += 1
71
 
 
 
 
72
  server_url = self._server_urls.get(action.tool_server)
73
  if server_url is None:
74
- return HRObservation(
75
- result=f"Unknown tool server: '{action.tool_server}'. Use one of: hrms, email, calendar, rocketchat.",
76
- is_error=True,
77
- tools_available=self._tools,
78
- task_instruction=self._current_task.instruction,
79
- done=False,
80
- reward=0.0,
81
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
 
83
  payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
84
  try:
85
  resp = requests.post(
@@ -90,37 +122,48 @@ class HREnvironment(Environment):
90
  )
91
  result = resp.text
92
  is_error = resp.status_code != 200
93
-
94
  try:
95
  parsed = resp.json()
96
  result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
97
  except (json.JSONDecodeError, ValueError):
98
  pass
99
-
100
  except requests.RequestException as exc:
101
- result = f"Tool invocation failed on {action.tool_server}: {exc}"
102
- is_error = True
 
 
 
 
 
 
 
103
 
104
- done = self._state.step_count >= MAX_STEPS_PER_EPISODE
 
 
 
 
 
 
 
 
105
 
106
  return HRObservation(
107
- result=result,
108
- is_error=is_error,
109
  tools_available=self._tools,
110
  task_instruction=self._current_task.instruction,
111
- done=done,
112
- reward=0.0,
113
  )
114
 
115
- @property
116
- def state(self) -> State:
117
- return self._state
118
-
119
  def _discover_all_tools(self) -> dict[str, list[str]]:
120
  """Fetch available tools from each tool server."""
121
  all_tools: dict[str, list[str]] = {}
122
  for name, url in self._server_urls.items():
123
  all_tools[name] = self._discover_tools(name, url)
 
124
  return all_tools
125
 
126
  def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
 
5
  import json
6
  import logging
7
  import os
8
+ from typing import Any
9
  from uuid import uuid4
10
 
11
  import requests
12
  from openenv.core.env_server.interfaces import Environment
13
  from openenv.core.env_server.types import State
14
 
15
+ from simlab_hr.evaluator import evaluate_episode
16
  from simlab_hr.models import HRAction, HRObservation
17
  from simlab_hr.tasks import BUNDLED_TASKS, get_task
18
 
 
47
  self._current_task = BUNDLED_TASKS[0]
48
  self._tools: dict[str, list[str]] = {}
49
  self._episode_count = 0
50
+ self._action_history: list[dict[str, Any]] = []
51
 
52
  def reset(self) -> HRObservation:
53
  self._current_task = get_task(self._episode_count)
54
  self._episode_count += 1
55
  self._state = State(episode_id=str(uuid4()), step_count=0)
56
  self._tools = self._discover_all_tools()
57
+ self._action_history = []
58
 
59
  return HRObservation(
60
  result=(
61
  "HR environment ready. You have access to 4 tool servers: "
62
  "hrms (employee records, leave, payroll), email (inbox), "
63
  "calendar (scheduling), and rocketchat (team messaging). "
64
+ "When you've completed the task, call tool_name='submit_task' "
65
+ "on any server to trigger evaluation and get your score."
66
  ),
67
  is_error=False,
68
  tools_available=self._tools,
 
74
    def step(self, action: HRAction) -> HRObservation:
        """Execute one tool action and return the resulting observation.

        ``submit_task`` (on any server) ends the episode and triggers the
        rubric judge; hitting MAX_STEPS_PER_EPISODE does the same. All other
        actions are proxied to the named tool server and recorded in the
        action history that the judge later reads.
        """
        self._state.step_count += 1

        # Sentinel action: agent declares the task done -> score the episode.
        if action.tool_name == "submit_task":
            return self._evaluate_and_finish()

        server_url = self._server_urls.get(action.tool_server)
        if server_url is None:
            result = f"Unknown tool server: '{action.tool_server}'. Use one of: hrms, email, calendar, rocketchat."
            is_error = True
        else:
            result, is_error = self._call_tool(server_url, action)

        # Record the call (unknown-server mistakes included) for the judge.
        self._action_history.append({
            "step": self._state.step_count,
            "server": action.tool_server,
            "tool": action.tool_name,
            "parameters": action.parameters,
            "result": result[:2000],  # cap per-entry size kept in the trace
            "is_error": is_error,
        })

        at_step_limit = self._state.step_count >= MAX_STEPS_PER_EPISODE
        if at_step_limit:
            # Out of step budget: force evaluation even without submit_task.
            return self._evaluate_and_finish()

        return HRObservation(
            result=result,
            is_error=is_error,
            tools_available=self._tools,
            task_instruction=self._current_task.instruction,
            done=False,
            reward=0.0,
        )
108
+
109
    @property
    def state(self) -> State:
        """Current episode state (episode id and step counter)."""
        return self._state
112
 
113
+ def _call_tool(self, server_url: str, action: HRAction) -> tuple[str, bool]:
114
+ """Proxy a tool call to the appropriate server. Returns (result, is_error)."""
115
  payload = {"action": {"tool_name": action.tool_name, "parameters": action.parameters}}
116
  try:
117
  resp = requests.post(
 
122
  )
123
  result = resp.text
124
  is_error = resp.status_code != 200
 
125
  try:
126
  parsed = resp.json()
127
  result = json.dumps(parsed, indent=2) if isinstance(parsed, (dict, list)) else str(parsed)
128
  except (json.JSONDecodeError, ValueError):
129
  pass
130
+ return result, is_error
131
  except requests.RequestException as exc:
132
+ return f"Tool invocation failed on {action.tool_server}: {exc}", True
133
+
134
+ def _evaluate_and_finish(self) -> HRObservation:
135
+ """Run the rubric judge and return the final observation with reward."""
136
+ eval_result = evaluate_episode(
137
+ task_instruction=self._current_task.instruction,
138
+ rubric=self._current_task.rubric,
139
+ action_history=self._action_history,
140
+ )
141
 
142
+ verdict_msg = (
143
+ f"Episode complete. Score: {eval_result.score:.2f} ({eval_result.verdict})"
144
+ )
145
+ if eval_result.evidence:
146
+ verdict_msg += "\nEvidence: " + "; ".join(eval_result.evidence)
147
+ if eval_result.failed_criteria:
148
+ verdict_msg += "\nFailed: " + "; ".join(eval_result.failed_criteria)
149
+ if eval_result.error:
150
+ verdict_msg += f"\nNote: {eval_result.error}"
151
 
152
  return HRObservation(
153
+ result=verdict_msg,
154
+ is_error=False,
155
  tools_available=self._tools,
156
  task_instruction=self._current_task.instruction,
157
+ done=True,
158
+ reward=eval_result.score,
159
  )
160
 
 
 
 
 
161
  def _discover_all_tools(self) -> dict[str, list[str]]:
162
  """Fetch available tools from each tool server."""
163
  all_tools: dict[str, list[str]] = {}
164
  for name, url in self._server_urls.items():
165
  all_tools[name] = self._discover_tools(name, url)
166
+ all_tools.setdefault("_meta", []).append("submit_task")
167
  return all_tools
168
 
169
  def _discover_tools(self, server_name: str, server_url: str) -> list[str]:
server/requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  openenv-core>=0.2.3
2
  pydantic>=2.0
3
  requests>=2.28
 
4
  uvicorn>=0.30
 
1
  openenv-core>=0.2.3
2
  pydantic>=2.0
3
  requests>=2.28
4
+ litellm>=1.80.0
5
  uvicorn>=0.30
tasks.py CHANGED
@@ -5,7 +5,7 @@ from __future__ import annotations
5
  import logging
6
  import os
7
  import random
8
- from dataclasses import dataclass
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -20,6 +20,7 @@ class Task:
20
  id: str
21
  instruction: str
22
  difficulty: str
 
23
 
24
 
25
  BUNDLED_TASKS: list[Task] = [
@@ -32,6 +33,11 @@ BUNDLED_TASKS: list[Task] = [
32
  "priya.mehta@gmail.com with the interview details."
33
  ),
34
  difficulty="medium",
 
 
 
 
 
35
  ),
36
  Task(
37
  id="hr-002",
@@ -41,6 +47,11 @@ BUNDLED_TASKS: list[Task] = [
41
  "if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
42
  ),
43
  difficulty="easy",
 
 
 
 
 
44
  ),
45
  Task(
46
  id="hr-003",
@@ -50,6 +61,11 @@ BUNDLED_TASKS: list[Task] = [
50
  "and send a summary email to hr-team@company.com with the findings."
51
  ),
52
  difficulty="medium",
 
 
 
 
 
53
  ),
54
  Task(
55
  id="hr-004",
@@ -61,6 +77,12 @@ BUNDLED_TASKS: list[Task] = [
61
  "invites via email to all participants including the candidate at alex.rivera@email.com."
62
  ),
63
  difficulty="hard",
 
 
 
 
 
 
64
  ),
65
  Task(
66
  id="hr-005",
@@ -70,6 +92,11 @@ BUNDLED_TASKS: list[Task] = [
70
  "email, and post an announcement in the #general channel on Rocket.Chat."
71
  ),
72
  difficulty="easy",
 
 
 
 
 
73
  ),
74
  Task(
75
  id="hr-006",
@@ -81,6 +108,12 @@ BUNDLED_TASKS: list[Task] = [
81
  "#engineering channel on Rocket.Chat."
82
  ),
83
  difficulty="hard",
 
 
 
 
 
 
84
  ),
85
  Task(
86
  id="hr-007",
@@ -90,6 +123,11 @@ BUNDLED_TASKS: list[Task] = [
90
  "asking them to review the pending requests."
91
  ),
92
  difficulty="medium",
 
 
 
 
 
93
  ),
94
  Task(
95
  id="hr-008",
@@ -100,6 +138,11 @@ BUNDLED_TASKS: list[Task] = [
100
  "send each employee an email notification about their scheduled review time."
101
  ),
102
  difficulty="hard",
 
 
 
 
 
103
  ),
104
  ]
105
 
@@ -165,4 +208,5 @@ def _fetch_api_task(api_key: str, task_index: int | None) -> Task:
165
  id=api_task.get("task_id", "api-unknown"),
166
  instruction=api_task.get("description", ""),
167
  difficulty=api_task.get("difficulty", "unknown"),
 
168
  )
 
5
  import logging
6
  import os
7
  import random
8
+ from dataclasses import dataclass, field
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
20
  id: str
21
  instruction: str
22
  difficulty: str
23
+ rubric: list[str] = field(default_factory=list)
24
 
25
 
26
  BUNDLED_TASKS: list[Task] = [
 
33
  "priya.mehta@gmail.com with the interview details."
34
  ),
35
  difficulty="medium",
36
+ rubric=[
37
+ "Employee record created in HRMS for Priya Mehta",
38
+ "Phone screening event scheduled on calendar for next Tuesday at 2 PM",
39
+ "Confirmation email sent to priya.mehta@gmail.com with interview details",
40
+ ],
41
  ),
42
  Task(
43
  id="hr-002",
 
47
  "if he has sufficient days, and notify his manager Sarah Chen via Rocket.Chat."
48
  ),
49
  difficulty="easy",
50
+ rubric=[
51
+ "Leave balance checked for employee EMP-0042",
52
+ "Leave request approved or denied based on balance",
53
+ "Manager Sarah Chen notified via RocketChat",
54
+ ],
55
  ),
56
  Task(
57
  id="hr-003",
 
61
  "and send a summary email to hr-team@company.com with the findings."
62
  ),
63
  difficulty="medium",
64
+ rubric=[
65
+ "Attendance records retrieved from HRMS",
66
+ "Employees with >2 absences identified",
67
+ "Summary email sent to hr-team@company.com",
68
+ ],
69
  ),
70
  Task(
71
  id="hr-004",
 
77
  "invites via email to all participants including the candidate at alex.rivera@email.com."
78
  ),
79
  difficulty="hard",
80
+ rubric=[
81
+ "Availability checked for all three interviewers on the calendar",
82
+ "Common 1-hour slot identified",
83
+ "Meeting booked on the calendar",
84
+ "Email invites sent to all participants including alex.rivera@email.com",
85
+ ],
86
  ),
87
  Task(
88
  id="hr-005",
 
92
  "email, and post an announcement in the #general channel on Rocket.Chat."
93
  ),
94
  difficulty="easy",
95
+ rubric=[
96
+ "Designation updated in HRMS to Senior Developer",
97
+ "Congratulatory email sent to Maria Santos",
98
+ "Announcement posted in #general on RocketChat",
99
+ ],
100
  ),
101
  Task(
102
  id="hr-006",
 
108
  "#engineering channel on Rocket.Chat."
109
  ),
110
  difficulty="hard",
111
+ rubric=[
112
+ "Employee record created in HRMS with department Engineering",
113
+ "Welcome email sent to david.kim@company.com",
114
+ "Orientation meeting scheduled on calendar for start date",
115
+ "Added to #engineering channel on RocketChat",
116
+ ],
117
  ),
118
  Task(
119
  id="hr-007",
 
123
  "asking them to review the pending requests."
124
  ),
125
  difficulty="medium",
126
+ rubric=[
127
+ "Pending leave requests retrieved from HRMS",
128
+ "Approving managers identified for each request",
129
+ "Reminder emails sent to respective managers",
130
+ ],
131
  ),
132
  Task(
133
  id="hr-008",
 
138
  "send each employee an email notification about their scheduled review time."
139
  ),
140
  difficulty="hard",
141
+ rubric=[
142
+ "Engineering department employees retrieved from HRMS",
143
+ "Individual 45-minute review meetings scheduled on calendar",
144
+ "Email notifications sent to each employee with their review time",
145
+ ],
146
  ),
147
  ]
148
 
 
208
  id=api_task.get("task_id", "api-unknown"),
209
  instruction=api_task.get("description", ""),
210
  difficulty=api_task.get("difficulty", "unknown"),
211
+ rubric=[],
212
  )