"""
Evaluation — LLM-as-a-Judge framework.

Uses gpt-4o-mini to evaluate the quality of responses generated by cheaper
models. Only runs when explicitly requested (evaluate=True in generate()).
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)

# Prompt sent to the judge model. Literal JSON braces are doubled so that
# str.format() only substitutes {query} and {response}.
_JUDGE_PROMPT = """You are an objective AI quality evaluator.

A user asked:
"{query}"

An AI assistant responded:
"{response}"

Please evaluate this response on the following criteria and provide a score from 1-10 for each:

1. **Accuracy**: Is the information correct and factually accurate?
2. **Completeness**: Does it fully answer the question?
3. **Clarity**: Is it clear and easy to understand?
4. **Conciseness**: Is it appropriately concise without being too brief?

Also provide an **Overall Score** from 1-10.

Respond ONLY in this exact JSON format:
{{
  "accuracy": <1-10>,
  "completeness": <1-10>,
  "clarity": <1-10>,
  "conciseness": <1-10>,
  "overall": <1-10>,
  "feedback": "<one or two sentences of feedback>"
}}"""


@dataclass
class EvaluationResult:
    """Scores and free-text feedback parsed from one judge-model reply.

    The prompt asks the judge for scores in the range 1-10; a score that is
    missing from the reply is stored as 0.0 by ``LLMJudge``.
    """

    accuracy: float
    completeness: float
    clarity: float
    conciseness: float
    overall: float
    feedback: str
    judge_model: str

    def to_dict(self) -> dict:
        """Return a shallow copy of the fields as a plain dict."""
        return self.__dict__.copy()

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the evaluation."""
        return (
            f"Evaluation (judge={self.judge_model})\n"
            f"  Overall     : {self.overall}/10\n"
            f"  Accuracy    : {self.accuracy}/10\n"
            f"  Completeness: {self.completeness}/10\n"
            f"  Clarity     : {self.clarity}/10\n"
            f"  Conciseness: {self.conciseness}/10\n"
            f"  Feedback    : {self.feedback}"
        )


class LLMJudge:
    """
    Evaluates LLM responses using a judge model (default: gpt-4o-mini).

    Disabled at construction time if litellm cannot be imported. Any failure
    during an evaluation (API error, unparsable reply, ...) is logged and
    reported to the caller as ``None`` rather than raised.
    """

    # Score fields requested from the judge, in EvaluationResult field order.
    _SCORE_KEYS = ("accuracy", "completeness", "clarity", "conciseness", "overall")

    def __init__(self, judge_model: str = "gpt-4o-mini"):
        """Store the judge model name and probe for the litellm dependency.

        Args:
            judge_model: Model identifier passed to ``litellm.completion``.
        """
        self.judge_model = judge_model
        self.enabled = True
        try:
            import litellm  # type: ignore  # noqa: F401
        except ImportError:
            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
            self.enabled = False

    def evaluate(self, query: str, response: str) -> Optional[EvaluationResult]:
        """
        Evaluates a query-response pair using the judge model.

        Args:
            query: The user's original question.
            response: The assistant answer to be judged.

        Returns:
            The parsed EvaluationResult, or None if evaluation is disabled
            or fails for any reason.
        """
        if not self.enabled:
            return None

        # Deliberately broad: evaluation is best-effort and must never break
        # the caller, so every failure is logged and mapped to None.
        try:
            import litellm  # type: ignore

            prompt = _JUDGE_PROMPT.format(query=query, response=response)
            result = litellm.completion(
                model=self.judge_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=300,
            )
            raw = str(result.choices[0].message.content).strip()  # type: ignore
            return self._parse_reply(raw)
        except Exception as e:
            logger.warning("LLMJudge: Evaluation failed: %s", e)
            return None

    def _parse_reply(self, raw: str) -> EvaluationResult:
        """Build an EvaluationResult from the judge's raw text reply.

        Raises:
            ValueError: If the reply contains no decodable JSON object, or a
                score cannot be converted to float.
            TypeError: If a score has a type ``float()`` rejects (e.g. null).
        """
        scores = self._extract_json_object(raw)
        numeric = {key: float(scores.get(key, 0)) for key in self._SCORE_KEYS}

        # Keep the `feedback: str` contract even if the judge returns null or
        # a non-string value.
        feedback = scores.get("feedback")
        feedback_text = "" if feedback is None else str(feedback)

        return EvaluationResult(
            feedback=feedback_text,
            judge_model=self.judge_model,
            **numeric,
        )

    @staticmethod
    def _extract_json_object(raw: str) -> dict:
        """Return the first JSON object embedded in ``raw``.

        Tolerates markdown code fences (with any language tag) and prose
        before or after the object by trying to decode from each ``{`` in
        turn instead of relying on a fixed fence layout.

        Raises:
            ValueError: If no ``{`` position decodes as a JSON object.
        """
        decoder = json.JSONDecoder()
        start = raw.find("{")
        while start != -1:
            try:
                # Decoding from a "{" yields a dict or raises.
                obj, _end = decoder.raw_decode(raw, start)
            except json.JSONDecodeError:
                start = raw.find("{", start + 1)
            else:
                return obj
        raise ValueError("no JSON object found in judge response")