"""
judge.py — LLM-as-a-judge answer quality evaluation.
Scores the generated answer on four dimensions (each 1–5):
- Accuracy : Is the answer factually correct given the context?
- Completeness : Does it fully address the question?
- Relevance : Is the answer focused and on-topic?
- Groundedness : Is every claim supported by the retrieved context?
Uses a single structured LLM call returning JSON to minimise cost.
"""
import json
import logging
from typing import List
import os
import litellm
from pydantic import BaseModel, Field
from config import JUDGE_MODEL
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# ─── Pydantic output model ────────────────────────────────────────────────────
class AnswerQualityScores(BaseModel):
    """Structured LLM-judge evaluation scores.

    Each of the four dimensions is an integer from 1 (worst) to 5 (best).
    The value 0 is accepted as a "not evaluated" sentinel: ``judge_answer``
    in this module returns all-zero scores when no API key is configured or
    when the judge call fails, so the lower bound must admit 0 or those
    fallback results could not be constructed.
    """

    accuracy: int = Field(
        ..., ge=0, le=5,
        description="Factual accuracy (1–5; 0 = not evaluated)",
    )
    completeness: int = Field(
        ..., ge=0, le=5,
        description="How fully the question is answered (1–5; 0 = not evaluated)",
    )
    relevance: int = Field(
        ..., ge=0, le=5,
        description="Relevance to the question (1–5; 0 = not evaluated)",
    )
    groundedness: int = Field(
        ..., ge=0, le=5,
        description="Claims backed by retrieved context (1–5; 0 = not evaluated)",
    )
    # Computed by the caller, not requested from the judge model.
    overall: float = Field(..., description="Mean of the four scores")
    reasoning: str = Field(..., description="One-sentence justification from the judge")
# System prompt sent to the judge model. It asks for the four per-dimension
# scores plus a one-sentence "reasoning" string as a bare JSON object.
# "overall" is not part of the requested schema; judge_answer computes it
# as the mean of the four scores.
_JUDGE_SYSTEM = """You are a strict, impartial evaluator of AI-generated answers about codebases.
Given:
- A user question
- Retrieved code context
- A generated answer
Score the answer on FOUR criteria, each from 1 to 5:
accuracy : Is every claim factually correct based on the context?
completeness : Does the answer fully address all parts of the question?
relevance : Is the answer focused on the question without padding?
groundedness : Are all claims directly supported by the retrieved context?
Respond ONLY with valid JSON matching exactly this schema (no extra keys):
{
"accuracy": <int 1-5>,
"completeness": <int 1-5>,
"relevance": <int 1-5>,
"groundedness": <int 1-5>,
"reasoning": "<one sentence justification>"
}"""
# Names of the four per-dimension scores the judge must return.
_SCORE_KEYS = ("accuracy", "completeness", "relevance", "groundedness")


def _unscored_result(reason: str) -> AnswerQualityScores:
    """Build the all-zero "not evaluated" result carrying *reason*.

    The 0 sentinel lies outside the 1-5 range, which the model's field
    constraints may reject, so the instance is created without validation
    rather than through the validating constructor.
    """
    build = getattr(AnswerQualityScores, "model_construct", None)  # pydantic v2
    if build is None:
        build = AnswerQualityScores.construct  # pydantic v1
    return build(
        accuracy=0,
        completeness=0,
        relevance=0,
        groundedness=0,
        overall=0.0,
        reasoning=reason,
    )


def _parse_judge_reply(raw: str) -> dict:
    """Decode the first JSON object found in the judge's reply.

    Starting at the first "{" skips markdown fences, language tags and any
    leading prose; ``raw_decode`` stops at the end of the object, so trailing
    fences or commentary are ignored as well.

    Raises:
        ValueError: if the reply holds no "{" or the JSON is malformed
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = raw.find("{")
    if start == -1:
        raise ValueError("judge response contains no JSON object")
    data, _end = json.JSONDecoder().raw_decode(raw[start:])
    return data


def _validated_score(name: str, value: object) -> int:
    """Return *value* as an int in 1..5, or raise ValueError."""
    # JSON numbers such as 4.0 are acceptable; normalise them to int.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(value, bool) or not isinstance(value, int) or not 1 <= value <= 5:
        raise ValueError(f"{name} must be an integer from 1 to 5, got {value!r}")
    return value


def judge_answer(
    query: str,
    context_docs: List,
    answer: str,
) -> AnswerQualityScores:
    """
    Evaluate an LLM-generated answer using an LLM judge.

    This consumes 1 LLM call. Results are returned as a Pydantic model.
    The function never raises for judge-side failures: if the
    ``OPENAI_API_KEY`` environment variable is unset or empty, or if the
    call or the parsing of its reply fails, an all-zero result is returned
    whose ``reasoning`` explains why.

    Args:
        query: The user's original question.
        context_docs: Documents used as context; each must expose a
            ``page_content`` string, of which the first 400 characters
            are shown to the judge.
        answer: The generated answer to evaluate.

    Returns:
        AnswerQualityScores with per-dimension scores and overall mean
        (rounded to 2 decimals), or all zeros when evaluation was skipped
        or failed.
    """
    if not os.getenv("OPENAI_API_KEY", ""):
        # No key configured: skip the call and report neutral scores.
        return _unscored_result("No API key — evaluation skipped.")

    context_text = "\n\n".join(
        f"[{i + 1}] {doc.page_content[:400]}" for i, doc in enumerate(context_docs)
    )
    user_msg = (
        f"Question: {query}\n\n"
        f"Retrieved Context:\n{context_text}\n\n"
        f"Generated Answer:\n{answer}"
    )

    # Deliberately broad: evaluation is best-effort, so any failure in the
    # call or in parsing degrades to an "unscored" result instead of raising.
    try:
        response = litellm.completion(
            model=JUDGE_MODEL,
            messages=[
                {"role": "system", "content": _JUDGE_SYSTEM},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=200,
            temperature=0.0,
        )
        # content may be None (e.g. an empty completion); treat it as empty text.
        data = _parse_judge_reply(response.choices[0].message.content or "")
        scores = {key: _validated_score(key, data[key]) for key in _SCORE_KEYS}
        return AnswerQualityScores(
            **scores,
            overall=round(sum(scores.values()) / len(_SCORE_KEYS), 2),
            reasoning=data.get("reasoning", ""),
        )
    except Exception as exc:
        logger.error("Judge evaluation failed: %s", exc)
        return _unscored_result(f"Evaluation failed: {exc}")