Spaces:

boy177
/

DevDocs

Sleeping

manan75

changed key imports

2397376 about 2 months ago

4.51 kB

	"""
	judge.py — LLM-as-a-judge answer quality evaluation.

	Scores the generated answer on four dimensions (each 1–5):
	- Accuracy : Is the answer factually correct given the context?
	- Completeness : Does it fully address the question?
	- Relevance : Is the answer focused and on-topic?
	- Groundedness : Is every claim supported by the retrieved context?

	Uses a single structured LLM call returning JSON to minimise cost.
	"""

	import json
	import logging
	from typing import List
	import os

	import litellm
	from pydantic import BaseModel, Field

	from config import JUDGE_MODEL

	logger = logging.getLogger(__name__)


	# ─── Pydantic output model ────────────────────────────────────────────────────

	class AnswerQualityScores(BaseModel):
	"""Structured LLM-judge evaluation scores."""
	accuracy: int = Field(..., ge=1, le=5, description="Factual accuracy (1–5)")
	completeness: int = Field(..., ge=1, le=5, description="How fully the question is answered (1–5)")
	relevance: int = Field(..., ge=1, le=5, description="Relevance to the question (1–5)")
	groundedness: int = Field(..., ge=1, le=5, description="Claims backed by retrieved context (1–5)")
	overall: float = Field(..., description="Mean of the four scores")
	reasoning: str = Field(..., description="One-sentence justification from the judge")


	_JUDGE_SYSTEM = """You are a strict, impartial evaluator of AI-generated answers about codebases.

	Given:
	- A user question
	- Retrieved code context
	- A generated answer

	Score the answer on FOUR criteria, each from 1 to 5:
	accuracy : Is every claim factually correct based on the context?
	completeness : Does the answer fully address all parts of the question?
	relevance : Is the answer focused on the question without padding?
	groundedness : Are all claims directly supported by the retrieved context?

	Respond ONLY with valid JSON matching exactly this schema (no extra keys):
	{
	"accuracy": <int 1-5>,
	"completeness": <int 1-5>,
	"relevance": <int 1-5>,
	"groundedness": <int 1-5>,
	"reasoning": "<one sentence justification>"
	}"""


	def judge_answer(
	query: str,
	context_docs: List,
	answer: str,
	) -> AnswerQualityScores:
	"""
	Evaluate an LLM-generated answer using an LLM judge.

	This consumes 1 LLM call. Results are returned as a Pydantic model.

	Args:
	query: The user's original question.
	context_docs: LangChain Documents used as context.
	answer: The generated answer to evaluate.

	Returns:
	AnswerQualityScores with per-dimension scores and overall mean.
	"""
	OPENAI_API_KEY =os.getenv("OPENAI_API_KEY", "")
	if not OPENAI_API_KEY:
	# Return neutral scores when no API key is configured.
	return AnswerQualityScores(
	accuracy=0, completeness=0, relevance=0, groundedness=0,
	overall=0.0, reasoning="No API key — evaluation skipped."
	)

	context_text = "\n\n".join(
	f"[{i+1}] {d.page_content[:400]}" for i, d in enumerate(context_docs)
	)
	user_msg = (
	f"Question: {query}\n\n"
	f"Retrieved Context:\n{context_text}\n\n"
	f"Generated Answer:\n{answer}"
	)

	try:
	response = litellm.completion(
	model=JUDGE_MODEL,
	messages=[
	{"role": "system", "content": _JUDGE_SYSTEM},
	{"role": "user", "content": user_msg},
	],
	max_tokens=200,
	temperature=0.0

	)
	raw = response.choices[0].message.content.strip()

	# Strip potential markdown fences
	raw = raw.strip().lstrip("```json").lstrip("```").rstrip("```").strip()
	data = json.loads(raw)

	scores_sum = data["accuracy"] + data["completeness"] + data["relevance"] + data["groundedness"]
	return AnswerQualityScores(
	accuracy=data["accuracy"],
	completeness=data["completeness"],
	relevance=data["relevance"],
	groundedness=data["groundedness"],
	overall=round(scores_sum / 4, 2),
	reasoning=data.get("reasoning", ""),
	)

	except Exception as e:
	logger.error(f"Judge evaluation failed: {e}")
	return AnswerQualityScores(
	accuracy=0, completeness=0, relevance=0, groundedness=0,
	overall=0.0, reasoning=f"Evaluation failed: {e}"
	)