Spaces:
Sleeping
Sleeping
| """ | |
| LLM-as-judge evaluator. | |
| Scores a (query, response) pair on 5 metrics: | |
| relevance, hallucination, coherence, completeness, depth | |
| """ | |
| import time | |
| from functools import lru_cache | |
| from pydantic import BaseModel, Field | |
| from langchain_groq import ChatGroq | |
| from langchain_core.prompts import ChatPromptTemplate | |
def _unit_score(description: str):
    """Return a required float field constrained to the closed range [0.0, 1.0]."""
    return Field(..., ge=0.0, le=1.0, description=description)


# Structured verdict produced by the judge model for one (query, response) pair.
# Four metrics are floats validated to lie in [0.0, 1.0]; `hallucination` is a
# plain string whose allowed values are stated only in its description and are
# not enforced by validation.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- pydantic surfaces a model docstring in the generated schema, which
# would change what is handed to the structured-output call. Confirm before
# converting this to a docstring.
class EvalScore(BaseModel):
    relevance: float = _unit_score(
        "0-1 how well the response answers the query"
    )
    hallucination: str = Field(
        ...,
        description="Exactly one of: No, Possible, Yes",
    )
    coherence: float = _unit_score(
        "0-1 how logically structured and easy to follow the response is"
    )
    completeness: float = _unit_score(
        "0-1 how thoroughly the response covers all key aspects of the query"
    )
    depth: float = _unit_score(
        "0-1 technical depth and quality of explanation"
    )
# System message for the judge model. It defines the scoring rubric for the
# five metrics (relevance, hallucination, coherence, completeness, depth) and
# instructs the model to answer with JSON only. evaluate() passes it as the
# "system" entry of the chat prompt.
# NOTE(review): this text is sent to the model verbatim, so any edit to the
# wording or score bands changes evaluation behaviour.
_SYSTEM = """You are a strict, neutral evaluator for AI-generated research responses.
Score the response on five criteria:
RELEVANCE (float 0.0-1.0):
0.90-1.00 Comprehensive, accurate, directly addresses every aspect of the query
0.70-0.89 Good coverage; one or two aspects missing or underdeveloped
0.50-0.69 Partial answer; significant gaps or tangential content
0.30-0.49 Mostly off-topic, very incomplete, or significant factual issues
0.00-0.29 Irrelevant or almost entirely wrong
HALLUCINATION (string, exactly one of: No / Possible / Yes):
No All claims are grounded; nothing fabricated or confidently wrong
Possible A few uncertain or unverifiable statements but no clear fabrications
Yes Clear fabrications, invented statistics, or confident wrong facts
COHERENCE (float 0.0-1.0):
0.90-1.00 Logically structured, flows naturally, headers/sections used well
0.70-0.89 Mostly clear; minor structural or flow issues
0.50-0.69 Disorganized or hard to follow in places
0.00-0.49 Chaotic, repetitive, or incoherent
COMPLETENESS (float 0.0-1.0):
0.90-1.00 All key sub-topics and nuances addressed
0.70-0.89 Most aspects covered; 1-2 important points missing
0.50-0.69 Covers the basics but misses significant aspects
0.00-0.49 Major gaps; leaves core parts of the query unanswered
DEPTH (float 0.0-1.0):
0.90-1.00 Expert-level explanation with mechanisms, tradeoffs, and examples
0.70-0.89 Good technical detail; could go deeper in 1-2 areas
0.50-0.69 Surface-level; explains what but not how or why
0.00-0.49 Shallow, generic, or lacking any technical substance
Return ONLY valid JSON with keys: relevance, hallucination, coherence, completeness, depth. No commentary."""
# Human-turn template. `{query}` and `{response}` are placeholders filled in by
# evaluate(); the "1800 chars" wording mirrors the `response[:1800]` slice that
# evaluate() applies before invoking the chain, so keep the two in sync.
_HUMAN = """Query: {query}
Response (truncated to 1800 chars):
{response}"""
@lru_cache(maxsize=1)
def _get_llm():
    """Return the judge model wrapped to emit ``EvalScore`` objects.

    The model name and temperature are fixed, so the client is built once and
    memoised instead of being re-created on every ``evaluate()`` call.

    NOTE(review): because the instance is cached, anything the client reads at
    construction time (presumably credentials/configuration from the
    environment) is captured on the first call only. Call
    ``_get_llm.cache_clear()`` to force a rebuild.
    """
    return ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.0,
    ).with_structured_output(EvalScore)
# Lower-cased substrings that identify a rate-limit failure in an error message.
# A bare "rate" is deliberately NOT used: it also matches unrelated words such
# as "generate" or "accurate" and would trigger pointless back-off sleeps.
_RATE_LIMIT_MARKERS = ("rate limit", "rate_limit", "ratelimit", "too many requests", "429")


def _is_rate_limit_error(exc: Exception) -> bool:
    """Return True when *exc* looks like an HTTP 429 / rate-limit failure."""
    # Prefer a structured status code when the exception exposes one.
    if getattr(exc, "status_code", None) == 429:
        return True
    message = str(exc).lower()
    return any(marker in message for marker in _RATE_LIMIT_MARKERS)


def evaluate(query: str, response: str, retries: int = 3) -> EvalScore:
    """Score a single (query, response) pair with the judge model.

    Args:
        query: The question or task that was given to the system under test.
        response: The answer to grade. Only the first 1800 characters are sent
            to the judge.
        retries: Total number of attempts, including the first one. Values
            below 1 are treated as 1 so the model is always called at least
            once.

    Returns:
        Whatever the structured-output chain yields, expected to be an
        ``EvalScore``.

    Raises:
        Exception: The underlying error is re-raised unchanged when it is not
            a rate-limit failure, or when the last attempt also fails.
    """
    attempts = max(1, retries)
    prompt = ChatPromptTemplate.from_messages([
        ("system", _SYSTEM),
        ("human", _HUMAN),
    ])
    chain = prompt | _get_llm()
    payload = {"query": query, "response": response[:1800]}

    for attempt in range(1, attempts + 1):
        try:
            return chain.invoke(payload)
        except Exception as exc:
            # Give up immediately on non-rate-limit errors or once the attempt
            # budget is exhausted; the bare raise keeps the original traceback.
            if attempt == attempts or not _is_rate_limit_error(exc):
                raise
            # Linear back-off: 20s after the first failure, 40s after the second, ...
            wait = 20 * attempt
            print(f" [eval] Rate limit hit, waiting {wait}s...")
            time.sleep(wait)

    # Unreachable: every loop iteration either returns or raises on the final
    # attempt. Kept so the function never falls through and returns None.
    raise RuntimeError("evaluate() failed after retries")