# simulationlab-hr / evaluator.py
# (Hugging Face Hub upload metadata: user renanserrano, commit 384d994 verified,
#  "Upload folder using huggingface_hub" — kept as a comment so the module parses.)
"""Lightweight rubric-based LLM judge for the HR environment."""
from __future__ import annotations
import json
import logging
import os
from dataclasses import dataclass
from typing import Any
logger = logging.getLogger(__name__)
PASS_THRESHOLD = 0.6
SYSTEM_PROMPT = """\
You are an impartial evaluator assessing whether an AI agent successfully \
completed an HR task. Score accurately based on evidence from the action trace.
Scoring:
- 0.8-1.0: All requirements fully met with clear evidence.
- 0.6-0.8: Core requirements met with minor gaps. (0.6 = PASS)
- 0.4-0.6: Partial completion, significant gaps remain.
- 0.2-0.4: Minimal progress, most requirements failed.
- 0.0-0.2: No meaningful progress.
Respond with valid JSON (no markdown fences):
{"score": 0.0, "verdict": "PASS or FAIL", "evidence": ["..."], "failed_criteria": ["..."]}"""
@dataclass
class EvalResult:
"""Result from the rubric judge."""
score: float
verdict: str
evidence: list[str]
failed_criteria: list[str]
error: str | None = None
def evaluate_episode(
*,
task_instruction: str,
rubric: list[str],
action_history: list[dict[str, Any]],
) -> EvalResult:
"""Run the rubric judge on a completed episode. Returns EvalResult with 0.0-1.0 score."""
model = os.environ.get("VERIFIER_MODEL", "").strip()
api_key = os.environ.get("VERIFIER_API_KEY", "").strip()
if not model or not api_key:
return EvalResult(
score=0.0,
verdict="SKIPPED",
evidence=[],
failed_criteria=[],
error="Set VERIFIER_MODEL and VERIFIER_API_KEY to enable evaluation",
)
provider = os.environ.get("VERIFIER_PROVIDER", "").strip() or None
base_url = os.environ.get("VERIFIER_BASE_URL", "").strip() or None
rubric_text = "\n".join(f"- {r}" for r in rubric) if rubric else "No specific rubric provided."
trace = json.dumps(action_history[-50:], indent=2, ensure_ascii=False)
if len(trace) > 40000:
trace = trace[:40000] + "\n... [truncated]"
user_prompt = f"""# Task
{task_instruction}
# Rubric Criteria
{rubric_text}
# Agent Action Trace
{trace}"""
try:
import litellm
litellm_model = model
if provider and not model.startswith(f"{provider}/"):
litellm_model = f"{provider}/{model}"
response = litellm.completion(
model=litellm_model,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
api_key=api_key,
base_url=base_url,
temperature=0.2,
)
raw = response.choices[0].message.content or ""
except Exception as e:
logger.warning("Rubric judge LLM call failed: %s", e)
return EvalResult(score=0.0, verdict="ERROR", evidence=[], failed_criteria=[], error=str(e))
return _parse_response(raw)
def _parse_response(raw: str) -> EvalResult:
"""Parse the judge's JSON response."""
text = raw.strip()
if text.startswith("```"):
text = text.strip("`\n")
if text.lower().startswith("json"):
text = text[4:].strip()
try:
data = json.loads(text)
except json.JSONDecodeError:
import re
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
data = json.loads(match.group(0))
except json.JSONDecodeError:
return EvalResult(
score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
error=f"Could not parse judge response: {raw[:300]}",
)
else:
return EvalResult(
score=0.0, verdict="ERROR", evidence=[], failed_criteria=[],
error=f"Could not parse judge response: {raw[:300]}",
)
score = max(0.0, min(float(data.get("score", 0.0)), 1.0))
verdict = data.get("verdict", "PASS" if score >= PASS_THRESHOLD else "FAIL")
evidence = data.get("evidence", [])
if isinstance(evidence, str):
evidence = [evidence]
failed = data.get("failed_criteria", [])
if isinstance(failed, str):
failed = [failed]
return EvalResult(score=score, verdict=str(verdict), evidence=evidence, failed_criteria=failed)