# test-ragp/src/rag_pipelines/retrieval_evaluation/usefulness_evaluator.py
# Uploaded by awinml ("Upload 107 files", commit 336f4a9, verified)
from typing import Any, Optional
import weave
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from rag_pipelines.prompts import USEFULNESS_EVALUATOR_PROMPT, UsefulnessEvaluatorResult
# Inclusive bounds of the usefulness scale; used to validate the
# ``score_threshold`` argument of ``UsefulnessEvaluator``.
SCORE_LOWER_LIMIT = 1
SCORE_UPPER_LIMIT = 5
class UsefulnessEvaluator:
    """Evaluates and filters document contexts based on their usefulness score.

    Uses a language model chain to assess document usefulness and retains only
    contexts meeting or exceeding the specified score threshold.

    Attributes:
        llm (ChatGroq): Language model instance for scoring.
        prompt (ChatPromptTemplate): Template for usefulness evaluation prompt.
        evaluator_chain (RunnableSequence): LangChain pipeline for evaluation
            (prompt piped into the LLM with structured output).
        score_threshold (int): Minimum score required for context inclusion,
            within ``SCORE_LOWER_LIMIT``..``SCORE_UPPER_LIMIT`` inclusive.
    """

    def __init__(self, llm: ChatGroq, score_threshold: Optional[int] = 3) -> None:
        """Initialize evaluator with language model and scoring threshold.

        Args:
            llm (ChatGroq): Configured ChatGroq instance for evaluation.
            score_threshold (Optional[int]): Minimum usefulness score to retain.
                Defaults to 3. ``None`` is treated as "use the default".
                Must be between ``SCORE_LOWER_LIMIT`` and ``SCORE_UPPER_LIMIT``
                inclusive.

        Raises:
            ValueError: If ``score_threshold`` lies outside the allowed range.
        """
        # The annotation allows None; without this guard the range check below
        # would fail with an opaque TypeError instead of using the default.
        if score_threshold is None:
            score_threshold = 3

        if not (SCORE_LOWER_LIMIT <= score_threshold <= SCORE_UPPER_LIMIT):
            # Build the message from the constants so it cannot drift from the
            # bounds that are actually enforced.
            msg = f"score_threshold must be between {SCORE_LOWER_LIMIT} and {SCORE_UPPER_LIMIT}"
            raise ValueError(msg)

        self.llm = llm
        self.prompt = ChatPromptTemplate.from_messages([("system", USEFULNESS_EVALUATOR_PROMPT)])
        # Structured output makes the chain return a UsefulnessEvaluatorResult
        # rather than free-form text.
        self.evaluator_chain = self.prompt | self.llm.with_structured_output(UsefulnessEvaluatorResult)
        self.score_threshold = score_threshold

    @weave.op()
    def score_context(self, question: str, context: str) -> int:
        """Calculate usefulness score for a document context relative to a question.

        Args:
            question (str): User question to evaluate against.
            context (str): Document text content to assess.

        Returns:
            int: Numerical usefulness score, i.e. the chain result's
            ``decision`` field converted to ``int``. The value is not
            range-checked here.
        """
        result = self.evaluator_chain.invoke({"question": question, "context": context})
        return int(result.decision)

    @weave.op()
    def __call__(self, state: dict[str, Any]) -> dict[str, Any]:
        """Filter document contexts based on their usefulness scores.

        Each context is scored with one ``score_context`` call, in order.

        Args:
            state (dict[str, Any]): Processing state containing:
                - "question" (str): Original user question
                - "documents" (list[Document]): Retrieved documents
                - "context" (list[str]): Document texts to filter

        Returns:
            dict[str, Any]: New state dict with filtered contexts:
                - "question": Original question
                - "context": Texts with score >= threshold (order preserved)
                - "documents": Original documents (unfiltered)

        Raises:
            KeyError: If any of the required keys is missing from ``state``.
        """
        question: str = state["question"]
        documents: list[Document] = state["documents"]
        relevant_context: list[str] = state["context"]

        filtered_context: list[str] = [
            context
            for context in relevant_context
            if self.score_context(question, context) >= self.score_threshold
        ]

        return {
            "question": question,
            "context": filtered_context,
            "documents": documents,
        }