# simulationlab-hr / evaluator.py
# (Hugging Face Hub upload metadata: user renanserrano, commit 384d994 verified,
#  "Upload folder using huggingface_hub" — kept as a comment so the module parses.)
"""Lightweight rubric-based LLM judge for the HR environment."""
from __future__ import annotations
import json
import logging
import os
from dataclasses import dataclass
from typing import Any
logger = logging.getLogger(__name__)
PASS_THRESHOLD = 0.6
SYSTEM_PROMPT = """\
You are an impartial evaluator assessing whether an AI agent successfully \
completed an HR task. Score accurately based on evidence from the action trace.
Scoring:
- 0.8-1.0: All requirements fully met with clear evidence.
- 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
- 0.4-0.6: Partial completion, significant gaps remain.
- 0.2-0.4: Minimal progress, most requirements failed.
- 0.0-0.2: No meaningful progress.
Respond with valid JSON (no markdown fences):
{"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
@dataclass
class EvalResult:
"""Result from the rubric judge."""
score: float
verdict: str
evidence: list[str]
failed_criteria: list[str]
error: str | None = None
def evaluate_episode(
*,
task_instruction: str,
rubric: list[str],
action_history: list[dict[str, Any]],
) -> EvalResult:
"""Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score."""
model = os.environ.get("VERIFIER_MODEL", "").strip()
api_key = os.environ.get("VERIFIER_API_KEY", "").strip()
if not model or not api_key:
return EvalResult(
score=0.0,
verdict="SKIPPED",
evidence=[],
failed_criteria=[],
error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
)
provider = os.environ.get("VERIFIER_PROVIDER", "").strip() or None
base_url = os.environ.get("VERIFIER_BASE_URL", "").strip() or None
rubric_text = "\n".join(f"- {r}" for r in rubric) if rubric else "No specific rubric provided."
trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
if len(trace) > 40000:
trace = trace[:40000] + "\n... [truncated]"
user_prompt = f"""# Task
{task_instruction}
# Rubric Criteria
{rubric_text}
# Agent Action Trace
{trace}"""
try:
import litellm
litellm_model = model
if provider and not model.startswith(f"{provider}/"):
litellm_model = f"{provider}/{model}"
response = litellm.completion(
model=litellm_model,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
api_key=api_key,
base_url=base_url,
temperature=0.2,
)
raw = response.choices[0].message.content or ""
except Exception as e:
logger.warning("Rubric judge LLM call failed: %s", e)
return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e))
return _parse_response(raw)
def _parse_response(raw: str) -> EvalResult:
"""Parse the judge's JSON response."""
text = raw.strip()
if text.startswith("```"):
text = text.strip("`\n")
if text.lower().startswith("json"):
text = text[4:].strip()
try:
data = json.loads(text)
except json.JSONDecodeError:
import re
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
data = json.loads(match.group(0))
except json.JSONDecodeError:
return EvalResult(
score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
error=f"Could not parse judge response: {raw[:300]}",
)
else:
return EvalResult(
score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
error=f"Could not parse judge response: {raw[:300]}",
)
score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
evidence = data.get("evidence", [])
if isinstance(evidence, str):
evidence = [evidence]
failed = data.get("failed_criteria", [])
if isinstance(failed, str):
failed = [failed]
return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)