"""
Rubric-based evaluation following the "Rubrics as Rewards" paper.

Implements RaR-Explicit: weighted sum of individual criterion scores (Equation 1).
"""
|
| |
|
| | from typing import List, Optional
|
| |
|
| | import litellm
|
| | from pydantic import BaseModel
|
| |
|
| |
|
class CriterionCheck(BaseModel):
    """Result of checking a single rubric criterion."""

    # Criterion metadata (title/description/weight) for the rubric item checked.
    title: str
    description: str
    # Signed importance weight; in scoring, a satisfied negative-weight
    # criterion subtracts from the raw score.
    weight: int
    # Judge verdict: did the response satisfy this criterion?
    satisfied: bool
    # Brief (1-2 sentence) justification from the judge model, if provided.
    reasoning: Optional[str] = None
|
| |
|
| |
|
class RubricEvaluation(BaseModel):
    """Complete rubric-based evaluation result."""

    # One check result per rubric criterion, in rubric order.
    criterion_checks: List[CriterionCheck]
    # Weighted sum of satisfied criteria (may be negative with penalty weights).
    raw_score: float
    # raw_score divided by the total positive weight, clamped to [0, 1].
    normalized_score: float
|
| |
|
| |
|
# Judge prompt template for a single criterion. Placeholders {question},
# {response}, and {criterion_description} are filled via str.format in
# check_criterion; the judge is asked for a JSON verdict only.
CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.

Question: {question}

Response to evaluate: {response}

Evaluation Criterion:
{criterion_description}

Your task: Determine if the response satisfies this criterion.

Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion

Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
|
| |
|
| |
|
class RubricData(BaseModel):
    """Rubric data loaded from file."""

    # Short criterion name.
    title: str
    # Full criterion text shown to the judge model.
    description: str
    # Signed importance weight used in the weighted-sum score.
    weight: int
|
| |
|
| |
|
def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
    """
    Check if response satisfies a single criterion.

    Args:
        question: The question being answered
        response: The response to evaluate
        criterion: The rubric criterion to check
        model: LLM model for judging

    Returns:
        CriterionCheck with satisfaction result; title/description/weight are
        copied from `criterion`, only satisfied/reasoning come from the judge.
    """

    # The prompt asks the judge for only "satisfied" and "reasoning".
    # Using CriterionCheck directly as the response_format forced the LLM to
    # also invent title/description/weight, and that hallucinated weight was
    # later summed into the score. Constrain the judge to the verdict alone.
    class _Verdict(BaseModel):
        satisfied: bool
        reasoning: Optional[str] = None

    prompt = CRITERION_PROMPT.format(
        question=question,
        response=response,
        criterion_description=criterion.description,
    )

    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,  # deterministic judging
        response_format=_Verdict,
    )

    verdict = _Verdict.model_validate_json(llm_response.choices[0].message.content)

    # Criterion metadata comes from the rubric itself — the source of truth.
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=verdict.satisfied,
        reasoning=verdict.reasoning,
    )
|
| |
|
| |
|
def evaluate_with_rubrics(
    question: str,
    response: str,
    rubrics: List[RubricData],
    model: str = "gpt-5-nano",
) -> RubricEvaluation:
    """
    Evaluate response using RaR-Explicit method (weighted sum).

    Implements Equation 1 from the paper, except the denominator is the total
    *positive* weight so negative-weight (penalty) criteria can only lower the
    score, never inflate the normalizer:
        r(x, ŷ) = Σ(w_j * c_j(x, ŷ)) / Σ_{w_j > 0}(w_j)

    Args:
        question: The question
        response: Response to evaluate
        rubrics: List of rubric criteria
        model: LLM model for judging

    Returns:
        RubricEvaluation with raw score and normalized score clamped to [0, 1].
        An empty (or penalty-only) rubric list yields a normalized score of 0.0.
    """
    # One judge call per criterion.
    checks = [check_criterion(question, response, rubric, model) for rubric in rubrics]

    # Sum weights from the rubric definitions (the source of truth), pairing
    # each check with its criterion, rather than trusting weights echoed back
    # in the judge output.
    raw_score = float(
        sum(rubric.weight for rubric, check in zip(rubrics, checks) if check.satisfied)
    )

    # Normalize by total positive weight; guard against a zero denominator.
    positive_weights = sum(r.weight for r in rubrics if r.weight > 0)
    normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
    # Penalties can push the ratio below 0; clamp into [0, 1].
    normalized_score = max(0.0, min(1.0, normalized_score))

    return RubricEvaluation(
        criterion_checks=checks,
        raw_score=raw_score,
        normalized_score=normalized_score,
    )
|
| |
|