"""Lightweight rubric-based LLM judge for the HR environment.""" from __future__ import annotations import json import logging import os from dataclasses import dataclass from typing import Any logger = logging.getLogger(__name__) PASS_THRESHOLD = 0.6 SYSTEM_PROMPT = """\ You are an impartial evaluator assessing whether an AI agent successfully \ completed an HR task. Score accurately based on evidence from the action trace. Scoring: - 0.8-1.0: All requirements fully met with clear evidence. - 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS) - 0.4-0.6: Partial completion, significant gaps remain. - 0.2-0.4: Minimal progress, most requirements failed. - 0.0-0.2: No meaningful progress. Respond with valid JSON (no markdown fences): {"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}""" @dataclass class EvalResult: """Result from the rubric judge.""" score: float verdict: str evidence: list[str] failed_criteria: list[str] error: str | None = None def evaluate_episode( *, task_instruction: str, rubric: list[str], action_history: list[dict[str, Any]], ) -> EvalResult: """Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score.""" model = os.environ.get("VERIFIER_MODEL", "").strip() api_key = os.environ.get("VERIFIER_API_KEY", "").strip() if not model or not api_key: return EvalResult( score=0.0, verdict="SKIPPED", evidence=[], failed_criteria=[], error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation", ) provider = os.environ.get("VERIFIER_PROVIDER", "").strip() or None base_url = os.environ.get("VERIFIER_BASE_URL", "").strip() or None rubric_text = "\n".join(f"- {r}" for r in rubric) if rubric else "No specific rubric provided." trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False) if len(trace) > 40000: trace = trace[:40000] + "\n... [truncated]" user_prompt = f"""# Task {task_instruction} # Rubric Criteria {rubric_text} # Agent Action Trace {trace}""" try: import litellm litellm_model = model if provider and not model.startswith(f"{provider}/"): litellm_model = f"{provider}/{model}" response = litellm.completion( model=litellm_model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}, ], api_key=api_key, base_url=base_url, temperature=0.2, ) raw = response.choices[0].message.content or "" except Exception as e: logger.warning("Rubric judge LLM call failed: %s", e) return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e)) return _parse_response(raw) def _parse_response(raw: str) -> EvalResult: """Parse the judge's JSON response.""" text = raw.strip() if text.startswith("```"): text = text.strip("`\n") if text.lower().startswith("json"): text = text[4:].strip() try: data = json.loads(text) except json.JSONDecodeError: import re match = re.search(r"\{.*\}", text, re.DOTALL) if match: try: data = json.loads(match.group(0)) except json.JSONDecodeError: return EvalResult( score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=f"Could not parse judge response: {raw[:300]}", ) else: return EvalResult( score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=f"Could not parse judge response: {raw[:300]}", ) score = max(0.0, min(float(data.get("score", 0.0)), 1.0)) verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL") evidence = data.get("evidence", []) if isinstance(evidence, str): evidence = [evidence] failed = data.get("failed_criteria", []) if isinstance(failed, str): failed = [failed] return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)