# llmopt-server/llmopt/evaluation/evaluator.py
# Author: Shrot101
# Commit eff2120 — feat: upgrade LLMOpt to V2 ML-powered architecture
"""
Evaluation — LLM-as-a-Judge framework.
Uses a judge model (gpt-4o-mini by default) to evaluate the quality of responses generated by cheaper models.
Only runs when explicitly requested (evaluate=True in generate()).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prompt template sent to the judge model. The ``{query}`` and ``{response}``
# placeholders are filled in with ``str.format``; because of that, the literal
# braces of the JSON example at the end are doubled (``{{`` / ``}}``) so they
# survive formatting as single braces.
_JUDGE_PROMPT = """You are an objective AI quality evaluator.
A user asked:
"{query}"
An AI assistant responded:
"{response}"
Please evaluate this response on the following criteria and provide a score from 1-10 for each:
1. **Accuracy**: Is the information correct and factually accurate?
2. **Completeness**: Does it fully answer the question?
3. **Clarity**: Is it clear and easy to understand?
4. **Conciseness**: Is it appropriately concise without being too brief?
Also provide an **Overall Score** from 1-10.
Respond ONLY in this exact JSON format:
{{
"accuracy": <1-10>,
"completeness": <1-10>,
"clarity": <1-10>,
"conciseness": <1-10>,
"overall": <1-10>,
"feedback": "<one sentence summary of the main strength or weakness>"
}}"""
@dataclass
class EvaluationResult:
    """
    Scores produced by a judge model for a single query/response pair.
    """

    # Per-criterion scores as reported by the judge.
    accuracy: float
    completeness: float
    clarity: float
    conciseness: float
    # Overall score as reported by the judge.
    overall: float
    # Short free-text summary from the judge.
    feedback: str
    # Identifier of the model that produced these scores.
    judge_model: str

    def to_dict(self) -> dict:
        """Return a shallow copy of the instance's fields as a plain dict."""
        return dict(vars(self))

    def __str__(self) -> str:
        """Render the scores as a multi-line, human-readable report."""
        report_lines = (
            f"Evaluation (judge={self.judge_model})\n",
            f" Overall : {self.overall}/10\n",
            f" Accuracy : {self.accuracy}/10\n",
            f" Completeness: {self.completeness}/10\n",
            f" Clarity : {self.clarity}/10\n",
            f" Conciseness: {self.conciseness}/10\n",
            f" Feedback : {self.feedback}",
        )
        return "".join(report_lines)
class LLMJudge:
    """
    Evaluates LLM responses using a judge model (default: gpt-4o-mini).

    The judge is disabled at construction time when ``litellm`` cannot be
    imported. Any error raised while evaluating (a failed completion call,
    an unparseable reply, ...) is logged and reported as ``None`` rather
    than propagated to the caller.
    """

    def __init__(self, judge_model: str = "gpt-4o-mini"):
        """
        Args:
            judge_model: Model identifier passed to ``litellm.completion``.
        """
        self.judge_model = judge_model
        self.enabled = True
        try:
            # Probe only: the import result is not kept, we just need to know
            # whether the dependency is available.
            import litellm  # type: ignore # noqa: F401
        except ImportError:
            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
            self.enabled = False

    @staticmethod
    def _parse_scores(raw: str) -> dict:
        """
        Extract the first JSON object embedded in ``raw``.

        Judge replies are not always bare JSON: they may be wrapped in a
        markdown code fence (with any language tag) or surrounded by prose.
        Rather than special-casing fences, every ``{`` is tried as the start
        of a JSON object until one decodes.

        Args:
            raw: Text returned by the judge model.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If no JSON object can be decoded from ``raw``.
        """
        import json

        decoder = json.JSONDecoder()
        start = raw.find("{")
        while start != -1:
            try:
                # raw_decode tolerates trailing text after the object
                # (closing fence, extra commentary).
                payload, _end = decoder.raw_decode(raw, start)
            except ValueError:  # json.JSONDecodeError is a ValueError
                start = raw.find("{", start + 1)
                continue
            return payload
        raise ValueError("no JSON object found in judge response")

    def evaluate(self, query: str, response: str) -> Optional[EvaluationResult]:
        """
        Evaluates a query-response pair using the judge model.

        Args:
            query: The user's original question.
            response: The answer to be scored.

        Returns:
            An ``EvaluationResult`` built from the judge's JSON reply, with
            missing scores defaulting to 0 and missing feedback to ``""``.
            Returns None if evaluation fails or is disabled.
        """
        if not self.enabled:
            return None
        try:
            import litellm  # type: ignore

            prompt = _JUDGE_PROMPT.format(query=query, response=response)
            result = litellm.completion(
                model=self.judge_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=300,
            )
            content = result.choices[0].message.content  # type: ignore
            # A missing completion becomes "" (not the string "None") so the
            # failure is reported as "no JSON object found".
            scores = self._parse_scores(str(content or ""))
            return EvaluationResult(
                accuracy=float(scores.get("accuracy", 0)),
                completeness=float(scores.get("completeness", 0)),
                clarity=float(scores.get("clarity", 0)),
                conciseness=float(scores.get("conciseness", 0)),
                overall=float(scores.get("overall", 0)),
                # The field is declared as str; coerce whatever the judge sent.
                feedback=str(scores.get("feedback") or ""),
                judge_model=self.judge_model,
            )
        except Exception as e:
            # Deliberate best-effort: evaluation is optional, so any failure
            # is logged and surfaced as None instead of breaking the caller.
            logger.warning("LLMJudge: Evaluation failed: %s", e)
            return None