Spaces:

Shrot102
/

llmopt-server

Running

App Files Files Community

llmopt-server / llmopt /evaluation /evaluator.py

Shrot101

feat: upgrade LLMOpt to V2 ML-powered architecture

eff2120 29 days ago

raw

history blame contribute delete

3.87 kB

	"""
	Evaluation — LLM-as-a-Judge framework.

	Uses gpt-4o-mini to evaluate the quality of responses generated by cheaper models.
	Only runs when explicitly requested (evaluate=True in generate()).
	"""
	from __future__ import annotations

	import logging
	from dataclasses import dataclass
	from typing import Optional

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)

	# Prompt template sent to the judge model. The ``{query}`` and ``{response}``
	# placeholders are filled in with ``str.format``; the doubled braces
	# (``{{`` / ``}}``) are format escapes that render as the literal braces of
	# the JSON object the judge is asked to return.
	_JUDGE_PROMPT = """You are an objective AI quality evaluator.

	A user asked:
	"{query}"

	An AI assistant responded:
	"{response}"

	Please evaluate this response on the following criteria and provide a score from 1-10 for each:
	1. Accuracy: Is the information correct and factually accurate?
	2. Completeness: Does it fully answer the question?
	3. Clarity: Is it clear and easy to understand?
	4. Conciseness: Is it appropriately concise without being too brief?

	Also provide an Overall Score from 1-10.

	Respond ONLY in this exact JSON format:
	{{
	"accuracy": <1-10>,
	"completeness": <1-10>,
	"clarity": <1-10>,
	"conciseness": <1-10>,
	"overall": <1-10>,
	"feedback": "<one sentence summary of the main strength or weakness>"
	}}"""


	@dataclass
	class EvaluationResult:
	    """Scores and feedback from a judge model for a single response.

	    Values are stored exactly as given; this class performs no range
	    validation on the scores.
	    """

	    # Per-criterion scores.
	    accuracy: float
	    completeness: float
	    clarity: float
	    conciseness: float
	    # Overall score for the response.
	    overall: float
	    # Short free-text summary from the judge.
	    feedback: str
	    # Identifier of the model that produced this evaluation.
	    judge_model: str

	    def to_dict(self) -> dict:
	        """Return a shallow copy of the instance's fields as a plain dict.

	        A copy is returned so callers can mutate the result without
	        touching this instance.
	        """
	        return self.__dict__.copy()

	    def __str__(self) -> str:
	        """Return a multi-line, human-readable summary of the evaluation."""
	        return (
	            f"Evaluation (judge={self.judge_model})\n"
	            f" Overall : {self.overall}/10\n"
	            f" Accuracy : {self.accuracy}/10\n"
	            f" Completeness: {self.completeness}/10\n"
	            f" Clarity : {self.clarity}/10\n"
	            f" Conciseness: {self.conciseness}/10\n"
	            f" Feedback : {self.feedback}"
	        )


	class LLMJudge:
	    """
	    Evaluates LLM responses using a judge model (default: gpt-4o-mini).

	    The judge is disabled when ``litellm`` cannot be imported. Any failure
	    while evaluating (for example a failing completion call or a reply that
	    cannot be parsed) is logged as a warning and reported as ``None``
	    instead of being raised.
	    """

	    def __init__(self, judge_model: str = "gpt-4o-mini"):
	        """Store the judge model name and probe for the ``litellm`` package.

	        Args:
	            judge_model: Model identifier passed to ``litellm.completion``.
	        """
	        self.judge_model = judge_model
	        self.enabled = True
	        try:
	            # Imported only to check availability; the name is not used here.
	            import litellm  # type: ignore  # noqa: F401
	        except ImportError:
	            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
	            self.enabled = False

	    @staticmethod
	    def _extract_scores(raw: str) -> dict:
	        """Parse the judge's reply text into the dict of scores it contains.

	        The JSON object is taken as the span from the first ``{`` to the
	        last ``}``, so Markdown code fences (with any language tag) and
	        prose around the object are ignored.

	        Args:
	            raw: Text returned by the judge model.

	        Returns:
	            The decoded JSON object.

	        Raises:
	            ValueError: If no ``{...}`` span exists or it is not valid JSON
	                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
	        """
	        import json

	        start = raw.find("{")
	        end = raw.rfind("}")
	        if start == -1 or end < start:
	            raise ValueError("no JSON object found in judge output")
	        return json.loads(raw[start : end + 1])

	    def evaluate(self, query: str, response: str) -> Optional[EvaluationResult]:
	        """
	        Evaluates a query-response pair using the judge model.

	        Args:
	            query: The user's original question.
	            response: The answer to be scored.

	        Returns:
	            An ``EvaluationResult`` built from the judge's JSON reply, with
	            missing numeric scores defaulting to 0 and missing feedback to
	            an empty string. ``None`` if evaluation is disabled or fails.
	        """
	        if not self.enabled:
	            return None

	        # Best-effort by design: evaluation is optional, so every failure is
	        # logged and converted to ``None`` instead of propagating.
	        try:
	            import litellm  # type: ignore

	            prompt = _JUDGE_PROMPT.format(query=query, response=response)
	            result = litellm.completion(
	                model=self.judge_model,
	                messages=[{"role": "user", "content": prompt}],
	                temperature=0.0,
	                max_tokens=300,
	            )

	            content = result.choices[0].message.content  # type: ignore
	            if content is None:
	                # Avoid str(None) -> "None", which would surface as a
	                # confusing JSON decoding error.
	                raise ValueError("judge returned no content")

	            scores = self._extract_scores(str(content))

	            return EvaluationResult(
	                accuracy=float(scores.get("accuracy", 0)),
	                completeness=float(scores.get("completeness", 0)),
	                clarity=float(scores.get("clarity", 0)),
	                conciseness=float(scores.get("conciseness", 0)),
	                overall=float(scores.get("overall", 0)),
	                # Coerce so the field always matches its ``str`` annotation,
	                # even if the judge returns null or a non-string value.
	                feedback=str(scores.get("feedback") or ""),
	                judge_model=self.judge_model,
	            )

	        except Exception as e:
	            logger.warning("LLMJudge: Evaluation failed: %s", e)
	            return None