Spaces:
Runtime error
Runtime error
| """Lightweight rubric-based LLM judge for the HR environment.""" | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| from dataclasses import dataclass | |
| from typing import Any | |
# Module-level logger; handlers/levels are configured by the host application.
logger = logging.getLogger(__name__)

# Minimum judge score that counts as a passing episode.
PASS_THRESHOLD = 0.6

# System prompt for the LLM judge. It instructs the model to emit bare JSON
# (no markdown fences) with a 0.0-1.0 score; the 0.6 PASS boundary stated in
# the rubric text below must stay in sync with PASS_THRESHOLD above.
SYSTEM_PROMPT = """\
You are an impartial evaluator assessing whether an AI agent successfully \
completed an HR task. Score accurately based on evidence from the action trace.
Scoring:
- 0.8-1.0: All requirements fully met with clear evidence.
- 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
- 0.4-0.6: Partial completion, significant gaps remain.
- 0.2-0.4: Minimal progress, most requirements failed.
- 0.0-0.2: No meaningful progress.
Respond with valid JSON (no markdown fences):
{"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
@dataclass
class EvalResult:
    """Result from the rubric judge.

    Bug fix: the class previously had only bare class-level annotations and
    no ``@dataclass`` decorator, so every keyword construction such as
    ``EvalResult(score=0.0, verdict="ERROR", ...)`` elsewhere in this module
    raised ``TypeError`` (plain ``object.__init__`` takes no arguments).
    ``dataclass`` was already imported but never applied.
    """

    score: float  # judge score, clamped to [0.0, 1.0] by the parser
    verdict: str  # "PASS", "FAIL", "SKIPPED", or "ERROR"
    evidence: list[str]  # supporting quotes/observations from the trace
    failed_criteria: list[str]  # rubric items the judge marked as failed
    error: str | None = None  # set when evaluation was skipped or errored
def evaluate_episode(
    *,
    task_instruction: str,
    rubric: list[str],
    action_history: list[dict[str, Any]],
) -> EvalResult:
    """Run the rubric judge on a completed episode.

    Reads the judge configuration from VERIFIER_* environment variables,
    sends the task, rubric, and a truncated action trace to the configured
    model, and returns an EvalResult with a 0.0-1.0 score. When the model or
    API key is not configured, returns a SKIPPED result instead of failing.
    """
    env = os.environ
    model = env.get("VERIFIER_MODEL", "").strip()
    api_key = env.get("VERIFIER_API_KEY", "").strip()
    if not (model and api_key):
        # Evaluation is opt-in: without credentials we skip rather than error.
        return EvalResult(
            score=0.0,
            verdict="SKIPPED",
            evidence=[],
            failed_criteria=[],
            error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
        )

    provider = env.get("VERIFIER_PROVIDER", "").strip() or None
    base_url = env.get("VERIFIER_BASE_URL", "").strip() or None

    # Render the rubric as a bullet list, with a placeholder when empty.
    if rubric:
        criteria_block = "\n".join("- " + criterion for criterion in rubric)
    else:
        criteria_block = "No specific rubric provided."

    # Serialize only the most recent 50 actions and cap the payload size so
    # the judge prompt stays within a reasonable context budget.
    trace_json = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
    if len(trace_json) > 40000:
        trace_json = trace_json[:40000] + "\n... [truncated]"

    prompt = (
        "# Task\n"
        f"{task_instruction}\n"
        "# Rubric Criteria\n"
        f"{criteria_block}\n"
        "# Agent Action Trace\n"
        f"{trace_json}"
    )

    try:
        import litellm

        # litellm expects "provider/model"; add the prefix only when missing.
        qualified_model = model
        if provider and not model.startswith(f"{provider}/"):
            qualified_model = f"{provider}/{model}"

        response = litellm.completion(
            model=qualified_model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            api_key=api_key,
            base_url=base_url,
            temperature=0.2,
        )
        reply = response.choices[0].message.content or ""
    except Exception as exc:
        # Best-effort judge: any transport/model failure becomes an ERROR
        # result rather than propagating into the caller.
        logger.warning("Rubric judge LLM call failed: %s", exc)
        return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(exc))

    return _parse_response(reply)
def _parse_response(raw: str) -> EvalResult:
    """Parse the judge's JSON response into an EvalResult.

    Tolerates markdown code fences and surrounding prose, clamps the score
    into [0.0, 1.0], and returns an ERROR-verdict result (never raises) when
    the response cannot be parsed or the score is not numeric.

    Fixes vs. the previous version: a non-numeric "score" value (e.g. a
    string or null) previously crashed in float(); the duplicated
    parse-failure EvalResult is now built in one place.
    """
    import re

    def _parse_error(message: str) -> EvalResult:
        # Single failure shape so all parse errors look identical to callers.
        return EvalResult(
            score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
            error=message,
        )

    text = raw.strip()
    # Strip ```json fences the model may emit despite instructions not to.
    if text.startswith("```"):
        text = text.strip("`\n")
        if text.lower().startswith("json"):
            text = text[4:].strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the first {...} span, in case the JSON is embedded in prose.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            return _parse_error(f"Could not parse judge response: {raw[:300]}")
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError:
            return _parse_error(f"Could not parse judge response: {raw[:300]}")

    # Clamp the score into [0, 1]; reject non-numeric values gracefully
    # instead of letting float() raise TypeError/ValueError.
    try:
        score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
    except (TypeError, ValueError):
        return _parse_error(f"Non-numeric score in judge response: {data.get('score')!r}")

    # Fall back to a threshold-derived verdict when the judge omitted one.
    verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")

    evidence = data.get("evidence", [])
    if isinstance(evidence, str):
        evidence = [evidence]
    failed = data.get("failed_criteria", [])
    if isinstance(failed, str):
        failed = [failed]
    return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)