ml-agent / eval /rubric_eval.py
"""
Rubric-based evaluation following the "Rubrics as Rewards" paper.
Implements RaR-Explicit: a weighted sum of individual criterion scores (Equation 1).
"""
from typing import List, Optional
import litellm
from pydantic import BaseModel


class CriterionCheck(BaseModel):
"""Result of checking a single rubric criterion."""
title: str
description: str
weight: int
satisfied: bool
    reasoning: Optional[str] = None


class RubricEvaluation(BaseModel):
"""Complete rubric-based evaluation result."""
criterion_checks: List[CriterionCheck]
    raw_score: float  # Unnormalized score
    normalized_score: float  # Score normalized to [0, 1]


CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
Question: {question}
Response to evaluate: {response}
Evaluation Criterion:
{criterion_description}
Your task: Determine if the response satisfies this criterion.
Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
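
# Example judge output for a single criterion (illustrative, not taken from the repo):
#   {"satisfied": true, "reasoning": "The response names the failure mode the criterion asks about."}
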
class RubricData(BaseModel):
"""Rubric data loaded from file."""
title: str
description: str
weight: int
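
# Illustrative rubric entries (hypothetical values): positive weights reward desired
# behaviour, while negative weights mark "pitfall" criteria that subtract from the score, e.g.
#   RubricData(title="Cites the provided context", description="...", weight=3)
#   RubricData(title="Pitfall: fabricates a source", description="...", weight=-2)
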
class CriterionJudgment(BaseModel):
    """Structured judge output matching the JSON fields requested in CRITERION_PROMPT."""
    satisfied: bool
    reasoning: Optional[str] = None


def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
"""
Check if response satisfies a single criterion.
Args:
question: The question being answered
response: The response to evaluate
criterion: The rubric criterion to check
model: LLM model for judging
Returns:
CriterionCheck with satisfaction result
"""
prompt = CRITERION_PROMPT.format(
question=question,
response=response,
criterion_description=criterion.description,
)
    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        response_format=CriterionJudgment,
    )
    # The judge returns only the satisfied/reasoning fields; the criterion metadata
    # (title, description, weight) is copied from the rubric itself so that scoring
    # uses the true weights rather than values echoed back by the model.
    judgment = CriterionJudgment.model_validate_json(
        llm_response.choices[0].message.content
    )
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=judgment.satisfied,
        reasoning=judgment.reasoning,
    )


def evaluate_with_rubrics(
question: str,
response: str,
rubrics: List[RubricData],
model: str = "gpt-5-nano",
) -> RubricEvaluation:
"""
Evaluate response using RaR-Explicit method (weighted sum).
    Implements Equation 1 from the paper:
        r(x, ŷ) = Σ_j w_j · c_j(x, ŷ) / Σ_j w_j
    with the denominator restricted to positive weights here, and the result
    clipped to [0, 1] so that triggered pitfall criteria cannot push it negative.
Args:
question: The question
response: Response to evaluate
rubrics: List of rubric criteria
model: LLM model for judging
Returns:
RubricEvaluation with normalized score
"""
# Check each criterion independently
checks = []
for rubric in rubrics:
check = check_criterion(question, response, rubric, model)
checks.append(check)
    # Calculate the weighted score (Equation 1).
    # Only positive weights contribute to the denominator.
    positive_weights = sum(r.weight for r in rubrics if r.weight > 0)
raw_score = 0.0
for check in checks:
if check.satisfied:
raw_score += check.weight
# Normalize to [0, 1]
normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
    # Clip to [0, 1] in case triggered negative-weight (pitfall) criteria make it negative
normalized_score = max(0.0, min(1.0, normalized_score))
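    # Worked example (illustrative numbers): weights [3, 1, -2] with satisfied flags
    # [True, False, True] give raw_score = 3 - 2 = 1 and positive_weights = 3 + 1 = 4,
    # so normalized_score = 1 / 4 = 0.25.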
return RubricEvaluation(
raw_score=raw_score,
normalized_score=normalized_score,
criterion_checks=checks,
)
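

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; not part of the evaluation pipeline).
# Assumes litellm can reach the chosen judge model, e.g. OPENAI_API_KEY is set
# for OpenAI-hosted models. The question, response, and rubrics below are
# made-up placeholders, not data from this repository.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_rubrics = [
        RubricData(
            title="Identifies overfitting",
            description="The response explains that the accuracy drop is caused by overfitting.",
            weight=3,
        ),
        RubricData(
            title="Suggests validation monitoring",
            description="The response recommends tracking a held-out validation set or early stopping.",
            weight=1,
        ),
        RubricData(
            title="Pitfall: fabricated citation",
            description="The response cites a source that does not appear in the question.",
            weight=-2,
        ),
    ]
    evaluation = evaluate_with_rubrics(
        question="Why does my model's test accuracy drop after epoch 50?",
        response="It is most likely overfitting; monitor validation loss and stop early.",
        rubrics=example_rubrics,
        model="gpt-4o-mini",
    )
    print(f"Raw score: {evaluation.raw_score}")
    print(f"Normalized score: {evaluation.normalized_score:.2f}")
    for check in evaluation.criterion_checks:
        status = "satisfied" if check.satisfied else "not satisfied"
        print(f"- {check.title}: {status}")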