ml-agent / eval /rubric_eval.py
"""
Rubric-based evaluation following the "Rubrics as Rewards" paper.
Implements RaR-Explicit: a weighted sum of individual criterion scores (Equation 1).
"""
from typing import List, Optional
import litellm
from pydantic import BaseModel


class CriterionCheck(BaseModel):
"""Result of checking a single rubric criterion."""
title: str
description: str
weight: int
satisfied: bool
    reasoning: Optional[str] = None


class RubricEvaluation(BaseModel):
"""Complete rubric-based evaluation result."""
criterion_checks: List[CriterionCheck]
    raw_score: float  # Unnormalized score
    normalized_score: float  # Score normalized to [0, 1]


CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
Question: {question}
Response to evaluate: {response}
Evaluation Criterion:
{criterion_description}
Your task: Determine if the response satisfies this criterion.
Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
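
# Example judge output for a single criterion (illustrative, not taken from the repo):
#   {"satisfied": true, "reasoning": "The response names the failure mode the criterion asks about."}
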
class RubricData(BaseModel):
"""Rubric data loaded from file."""
title: str
description: str
weight: int
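
# Illustrative rubric entries (hypothetical values): positive weights reward desired
# behaviour, while negative weights mark "pitfall" criteria that subtract from the score, e.g.
#   RubricData(title="Cites the provided context", description="...", weight=3)
#   RubricData(title="Pitfall: fabricates a source", description="...", weight=-2)
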
class CriterionJudgment(BaseModel):
    """Structured judge output matching the JSON fields requested in CRITERION_PROMPT."""
    satisfied: bool
    reasoning: Optional[str] = None


def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
"""
Check if response satisfies a single criterion.
Args:
question: The question being answered
response: The response to evaluate
criterion: The rubric criterion to check
model: LLM model for judging
Returns:
CriterionCheck with satisfaction result
"""
prompt = CRITERION_PROMPT.format(
question=question,
response=response,
criterion_description=criterion.description,
)
    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        response_format=CriterionJudgment,
    )
    # The judge returns only the satisfied/reasoning fields; the criterion metadata
    # (title, description, weight) is copied from the rubric itself so that scoring
    # uses the true weights rather than values echoed back by the model.
    judgment = CriterionJudgment.model_validate_json(
        llm_response.choices[0].message.content
    )
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=judgment.satisfied,
        reasoning=judgment.reasoning,
    )


def evaluate_with_rubrics(
question: str,
response: str,
rubrics: List[RubricData],
model: str = "gpt-5-nano",
) -> RubricEvaluation:
"""
Evaluate response using RaR-Explicit method (weighted sum).
    Implements Equation 1 from the paper:
        r(x, ŷ) = Σ_j w_j · c_j(x, ŷ) / Σ_j w_j
    with the denominator restricted to positive weights here, and the result
    clipped to [0, 1] so that triggered pitfall criteria cannot push it negative.
Args:
question: The question
response: Response to evaluate
rubrics: List of rubric criteria
model: LLM model for judging
Returns:
RubricEvaluation with normalized score
"""
# Check each criterion independently
checks = []
for rubric in rubrics:
check = check_criterion(question, response, rubric, model)
checks.append(check)
    # Calculate the weighted score (Equation 1).
    # Only positive weights contribute to the denominator.
    positive_weights = sum(r.weight for r in rubrics if r.weight > 0)
raw_score = 0.0
for check in checks:
if check.satisfied:
raw_score += check.weight
# Normalize to [0, 1]
normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
    # Clip to [0, 1] in case triggered negative-weight (pitfall) criteria make it negative
normalized_score = max(0.0, min(1.0, normalized_score))
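    # Worked example (illustrative numbers): weights [3, 1, -2] with satisfied flags
    # [True, False, True] give raw_score = 3 - 2 = 1 and positive_weights = 3 + 1 = 4,
    # so normalized_score = 1 / 4 = 0.25.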
return RubricEvaluation(
raw_score=raw_score,
normalized_score=normalized_score,
criterion_checks=checks,
)
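

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; not part of the evaluation pipeline).
# Assumes litellm can reach the chosen judge model, e.g. OPENAI_API_KEY is set
# for OpenAI-hosted models. The question, response, and rubrics below are
# made-up placeholders, not data from this repository.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_rubrics = [
        RubricData(
            title="Identifies overfitting",
            description="The response explains that the accuracy drop is caused by overfitting.",
            weight=3,
        ),
        RubricData(
            title="Suggests validation monitoring",
            description="The response recommends tracking a held-out validation set or early stopping.",
            weight=1,
        ),
        RubricData(
            title="Pitfall: fabricated citation",
            description="The response cites a source that does not appear in the question.",
            weight=-2,
        ),
    ]
    evaluation = evaluate_with_rubrics(
        question="Why does my model's test accuracy drop after epoch 50?",
        response="It is most likely overfitting; monitor validation loss and stop early.",
        rubrics=example_rubrics,
        model="gpt-4o-mini",
    )
    print(f"Raw score: {evaluation.raw_score}")
    print(f"Normalized score: {evaluation.normalized_score:.2f}")
    for check in evaluation.criterion_checks:
        status = "satisfied" if check.satisfied else "not satisfied"
        print(f"- {check.title}: {status}")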