File size: 4,355 Bytes
384d994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Lightweight rubric-based LLM judge for the HR environment."""

from __future__ import annotations

import json
import logging
import os
from dataclasses import dataclass
from typing import Any

logger = logging.getLogger(__name__)

PASS_THRESHOLD = 0.6

SYSTEM_PROMPT = """\
You are an impartial evaluator assessing whether an AI agent successfully \
completed an HR task. Score accurately based on evidence from the action trace.

Scoring:
- 0.8-1.0: All requirements fully met with clear evidence.
- 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
- 0.4-0.6: Partial completion, significant gaps remain.
- 0.2-0.4: Minimal progress, most requirements failed.
- 0.0-0.2: No meaningful progress.

Respond with valid JSON (no markdown fences):
{"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""


@dataclass
class EvalResult:
    """Result from the rubric judge.

    Produced by evaluate_episode(); a score at or above PASS_THRESHOLD
    (0.6) corresponds to a PASS verdict.
    """

    score: float  # judge score clamped to [0.0, 1.0]; 0.0 when skipped or errored
    verdict: str  # "PASS", "FAIL", "SKIPPED", or "ERROR"
    evidence: list[str]  # judge-cited evidence supporting the score
    failed_criteria: list[str]  # rubric criteria the judge marked as unmet
    error: str | None = None  # set when the judge was skipped, failed, or unparseable


def evaluate_episode(
    *,
    task_instruction: str,
    rubric: list[str],
    action_history: list[dict[str, Any]],
) -> EvalResult:
    """Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score."""

    def _env(name: str) -> str:
        # All judge configuration comes from the environment.
        return os.environ.get(name, "").strip()

    model = _env("VERIFIER_MODEL")
    api_key = _env("VERIFIER_API_KEY")

    # Without credentials the judge is skipped rather than treated as a failure.
    if not (model and api_key):
        return EvalResult(
            score=0.0,
            verdict="SKIPPED",
            evidence=[],
            failed_criteria=[],
            error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
        )

    provider = _env("VERIFIER_PROVIDER") or None
    base_url = _env("VERIFIER_BASE_URL") or None

    # Render the rubric as a bullet list for the prompt.
    if rubric:
        rubric_text = "\n".join(f"- {r}" for r in rubric)
    else:
        rubric_text = "No specific rubric provided."

    # Only the last 50 actions are sent, and the serialized trace is capped
    # so the prompt stays within reasonable context limits.
    trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
    if len(trace) > 40000:
        trace = trace[:40000] + "\n... [truncated]"

    user_prompt = f"""# Task
{task_instruction}

# Rubric Criteria
{rubric_text}

# Agent Action Trace
{trace}"""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    try:
        import litellm

        # litellm routes on a "provider/model" prefix; add it only when the
        # model string does not already carry it.
        litellm_model = (
            f"{provider}/{model}"
            if provider and not model.startswith(f"{provider}/")
            else model
        )
        response = litellm.completion(
            model=litellm_model,
            messages=messages,
            api_key=api_key,
            base_url=base_url,
            temperature=0.2,
        )
        raw = response.choices[0].message.content or ""
    except Exception as exc:
        # Any judge-side failure (network, auth, model) degrades to an ERROR
        # result instead of propagating into the caller.
        logger.warning("Rubric judge LLM call failed: %s", exc)
        return EvalResult(
            score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(exc)
        )

    return _parse_response(raw)


def _parse_response(raw: str) -> EvalResult:
    """Parse the judge's JSON response into an EvalResult.

    Tolerates markdown code fences and surrounding prose: if the full text is
    not valid JSON, falls back to the widest ``{...}`` span. The score is
    clamped to [0.0, 1.0]; a non-numeric score or a non-object top level
    (e.g. a bare list or string) yields an ERROR result instead of raising,
    which the old float()/dict.get() path did not guarantee.
    """
    text = raw.strip()
    # Strip optional markdown fences, e.g. ```json ... ```.
    if text.startswith("```"):
        text = text.strip("`\n")
        if text.lower().startswith("json"):
            text = text[4:].strip()

    data = _extract_json_object(text)
    if not isinstance(data, dict):
        # Unparseable text, or valid JSON with the wrong shape (list, string,
        # null) — the old code raised AttributeError on data.get() here.
        return _parse_error(raw)

    try:
        score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
    except (TypeError, ValueError):
        # e.g. {"score": "high"} — a non-numeric score is unusable.
        return _parse_error(raw)

    # Default the verdict from the score when the judge omits it.
    verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
    evidence = data.get("evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    failed = data.get("failed_criteria", [])
    if isinstance(failed, str):
        failed = [failed]

    return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)


def _parse_error(raw: str) -> EvalResult:
    """Build the standard ERROR result for an unusable judge response."""
    return EvalResult(
        score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
        error=f"Could not parse judge response: {raw[:300]}",
    )


def _extract_json_object(text: str) -> Any:
    """Return parsed JSON from *text*, or None when nothing parseable is found.

    Tries the whole text first, then the widest ``{...}`` span as a fallback
    for judges that wrap their JSON in prose.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    import re

    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None