# NOTE: "Spaces: Running" banner from the Hugging Face Spaces page was
# captured during extraction; it is not part of the source file.
| """ | |
| Rubric-based evaluation following the "Rubrics as Rewards" paper. | |
| Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1) | |
| """ | |
| from typing import List, Optional | |
| import litellm | |
| from pydantic import BaseModel | |
class CriterionCheck(BaseModel):
    """Result of checking a single rubric criterion."""

    # Criterion metadata (title/description/weight) mirrors the rubric entry
    # being checked, so a check is self-describing without the RubricData.
    title: str
    description: str
    # Signed importance weight; negative weights appear to encode "pitfall"
    # criteria (see the clipping note in evaluate_with_rubrics) — TODO confirm.
    weight: int
    # Judge's verdict: True iff the response satisfies this criterion.
    satisfied: bool
    # Brief (1-2 sentence) judge explanation, per CRITERION_PROMPT; optional.
    reasoning: Optional[str] = None
class RubricEvaluation(BaseModel):
    """Complete rubric-based evaluation result (RaR-Explicit, Equation 1)."""

    # One check per rubric criterion, in the order the rubrics were given.
    criterion_checks: List[CriterionCheck]
    raw_score: float  # Unnormalized score (sum of satisfied criteria's weights)
    normalized_score: float  # Score normalized to [0, 1]
# Judge prompt template for a single criterion. Placeholders filled by
# check_criterion: {question}, {response}, {criterion_description}.
# Note the judge is asked to emit ONLY "satisfied" and "reasoning".
CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
Question: {question}
Response to evaluate: {response}
Evaluation Criterion:
{criterion_description}
Your task: Determine if the response satisfies this criterion.
Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
class RubricData(BaseModel):
    """Rubric data loaded from file."""

    # Short criterion name.
    title: str
    # Full criterion text; this is what the judge sees in CRITERION_PROMPT.
    description: str
    # Signed weight used in the Equation 1 weighted sum; only positive
    # weights enter the normalization denominator (see evaluate_with_rubrics).
    weight: int
def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
    """
    Check if response satisfies a single criterion.

    Args:
        question: The question being answered
        response: The response to evaluate
        criterion: The rubric criterion to check
        model: LLM model for judging

    Returns:
        CriterionCheck with satisfaction result; title/description/weight are
        copied from ``criterion``, never taken from the LLM output.

    Raises:
        pydantic.ValidationError: if the judge output does not match the
            expected {"satisfied", "reasoning"} schema.
    """

    # The prompt asks the judge for ONLY "satisfied" and "reasoning", so the
    # structured-output schema must match. The previous code used
    # response_format=CriterionCheck, whose required title/description/weight
    # the judge was never given — validation could fail, or worse, the model
    # would hallucinate a `weight` that then corrupted the weighted sum.
    class _Verdict(BaseModel):
        satisfied: bool
        reasoning: Optional[str] = None

    prompt = CRITERION_PROMPT.format(
        question=question,
        response=response,
        criterion_description=criterion.description,
    )
    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,  # deterministic judging
        response_format=_Verdict,
    )
    verdict = _Verdict.model_validate_json(llm_response.choices[0].message.content)

    # Metadata is known locally; only the verdict comes from the LLM.
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=verdict.satisfied,
        reasoning=verdict.reasoning,
    )
def evaluate_with_rubrics(
    question: str,
    response: str,
    rubrics: List[RubricData],
    model: str = "gpt-5-nano",
) -> RubricEvaluation:
    """
    Evaluate response using RaR-Explicit method (weighted sum).

    Implements Equation 1 from paper:
        r(x, ŷ) = Σ(w_j * c_j(x, ŷ)) / Σ(w_j for w_j > 0)

    Only positive weights enter the denominator, so negative-weight
    ("pitfall") criteria can only pull the score down; the result is then
    clipped to [0, 1].

    Args:
        question: The question
        response: Response to evaluate
        rubrics: List of rubric criteria
        model: LLM model for judging each criterion
            # NOTE(review): default differs from check_criterion's
            # "gpt-4o-mini" default — confirm this asymmetry is intended.

    Returns:
        RubricEvaluation with per-criterion checks, raw score, and
        normalized score in [0, 1].
    """
    # Judge each criterion independently (one LLM call per criterion).
    checks = [check_criterion(question, response, rubric, model) for rubric in rubrics]

    # Weighted sum of satisfied criteria (Equation 1 numerator). Satisfied
    # negative-weight pitfalls subtract from the score.
    raw_score = float(sum(check.weight for check in checks if check.satisfied))

    # Denominator: sum of positive weights only. (The previous abs() was
    # redundant — the filter already guarantees weight > 0.) Guard against
    # an empty or all-pitfall rubric set to avoid division by zero.
    positive_weight_total = sum(r.weight for r in rubrics if r.weight > 0)
    normalized_score = (
        raw_score / positive_weight_total if positive_weight_total > 0 else 0.0
    )
    # Clip to [0, 1] in case satisfied pitfalls push the ratio negative.
    normalized_score = max(0.0, min(1.0, normalized_score))

    return RubricEvaluation(
        criterion_checks=checks,
        raw_score=raw_score,
        normalized_score=normalized_score,
    )