Spaces:
Running
Running
| """ | |
| Evaluation — LLM-as-a-Judge framework. | |
| Uses gpt-4o-mini to evaluate the quality of responses generated by cheaper models. | |
| Only runs when explicitly requested (evaluate=True in generate()). | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
# Prompt template sent to the judge model. It is filled in with
# ``str.format(query=..., response=...)``, which is why the literal braces of
# the JSON example are doubled (``{{`` / ``}}``): they must survive formatting
# as single braces. The template asks for five 1-10 scores plus a one-sentence
# ``feedback`` string, returned as a single JSON object.
_JUDGE_PROMPT = """You are an objective AI quality evaluator.
A user asked:
"{query}"
An AI assistant responded:
"{response}"
Please evaluate this response on the following criteria and provide a score from 1-10 for each:
1. **Accuracy**: Is the information correct and factually accurate?
2. **Completeness**: Does it fully answer the question?
3. **Clarity**: Is it clear and easy to understand?
4. **Conciseness**: Is it appropriately concise without being too brief?
Also provide an **Overall Score** from 1-10.
Respond ONLY in this exact JSON format:
{{
"accuracy": <1-10>,
"completeness": <1-10>,
"clarity": <1-10>,
"conciseness": <1-10>,
"overall": <1-10>,
"feedback": "<one sentence summary of the main strength or weakness>"
}}"""
# NOTE: the decorator is required. It generates the keyword ``__init__`` that
# callers use to build a result; without it the class only declares
# annotations and ``EvaluationResult(accuracy=...)`` raises ``TypeError``.
@dataclass
class EvaluationResult:
    """Scores and feedback reported by a judge model for one response."""

    # Per-criterion ratings requested from the judge on a 1-10 scale.
    accuracy: float
    completeness: float
    clarity: float
    conciseness: float
    # The judge's overall rating, also requested on a 1-10 scale.
    overall: float
    # One-sentence summary of the main strength or weakness.
    feedback: str
    # Identifier of the model that produced these scores.
    judge_model: str

    def to_dict(self) -> dict:
        """Return the fields as a new dict (a shallow copy of the instance state)."""
        return self.__dict__.copy()

    def __str__(self) -> str:
        """Return a multi-line, human-readable report of all scores."""
        return (
            f"Evaluation (judge={self.judge_model})\n"
            f" Overall : {self.overall}/10\n"
            f" Accuracy : {self.accuracy}/10\n"
            f" Completeness: {self.completeness}/10\n"
            f" Clarity : {self.clarity}/10\n"
            f" Conciseness: {self.conciseness}/10\n"
            f" Feedback : {self.feedback}"
        )
class LLMJudge:
    """Score LLM responses with a judge model (default: gpt-4o-mini).

    The judge disables itself at construction time when ``litellm`` cannot be
    imported. Nothing else is checked up front: any failure during an
    evaluation call (request errors, unparseable judge output, ...) is logged
    as a warning and reported to the caller as ``None`` instead of raising.
    """

    def __init__(self, judge_model: str = "gpt-4o-mini"):
        """Store the judge model name and probe for the ``litellm`` package.

        Args:
            judge_model: Model identifier passed to ``litellm.completion``.
        """
        self.judge_model = judge_model
        self.enabled = True
        try:
            import litellm  # type: ignore  # noqa: F401
        except ImportError:
            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
            self.enabled = False

    @staticmethod
    def _extract_json(raw: str) -> str:
        """Return the JSON-object portion of the judge's raw reply.

        Handles replies wrapped in a markdown code fence (with or without a
        language tag) as well as replies with prose before or after the
        object. If no ``{...}`` span is found the stripped text is returned
        unchanged so the caller's JSON parser reports the error.
        """
        text = raw.strip()
        if "```" in text:
            # Keep only the content of the first fenced block.
            text = text.split("```")[1]
        # Trim to the outermost braces; this also drops a leading language
        # tag such as "json" and any surrounding commentary.
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        return text.strip()

    def evaluate(self, query: str, response: str) -> Optional[EvaluationResult]:
        """Ask the judge model to score ``response`` as an answer to ``query``.

        Args:
            query: The user's original question.
            response: The assistant answer to be evaluated.

        Returns:
            An ``EvaluationResult`` built from the judge's JSON reply (score
            keys missing from the reply default to 0), or ``None`` when the
            judge is disabled or the evaluation fails for any reason.
        """
        if not self.enabled:
            return None

        # Deliberately broad: evaluation is best-effort and must never break
        # the caller, so every failure is logged and mapped to None.
        try:
            import json

            import litellm  # type: ignore

            prompt = _JUDGE_PROMPT.format(query=query, response=response)
            result = litellm.completion(
                model=self.judge_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=300,
            )
            content = result.choices[0].message.content  # type: ignore
            if content is None:
                # Avoid parsing the literal string "None" and logging a
                # misleading JSON error.
                raise ValueError("judge returned no content")

            scores = json.loads(self._extract_json(str(content)))
            if not isinstance(scores, dict):
                raise ValueError("judge output is not a JSON object")

            return EvaluationResult(
                accuracy=float(scores.get("accuracy", 0)),
                completeness=float(scores.get("completeness", 0)),
                clarity=float(scores.get("clarity", 0)),
                conciseness=float(scores.get("conciseness", 0)),
                overall=float(scores.get("overall", 0)),
                # Coerce so the field is always a string, even for null.
                feedback=str(scores.get("feedback") or ""),
                judge_model=self.judge_model,
            )
        except Exception as e:
            logger.warning("LLMJudge: Evaluation failed: %s", e)
            return None