Rabbook / evaluation /eval_common.py
Matcry's picture
Deploy snapshot
c76423f
Raw
History Blame Contribute Delete
4.4 kB
"""
Shared helpers for the three evaluation scripts.
Centralises model construction, dataset loading, and retrieval so that
evaluate_retrieval_metrics.py, evaluate_ragas.py, and evaluate_agent.py
stay short and focused on their own logic.
"""
import json
import warnings
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
warnings.filterwarnings("ignore", category=DeprecationWarning)
from core.config import (
DB_DIR,
DEFAULT_BM25_CANDIDATE_K,
DEFAULT_LLM_MODEL,
DEFAULT_LLM_PROVIDER,
DEFAULT_RERANK_CANDIDATE_K,
DEFAULT_RERANK_MODEL,
DEFAULT_RETRIEVAL_K,
OLLAMA_BASE_URL,
OLLAMA_NUM_GPU,
OLLAMA_THINKING_MODE,
get_google_api_key,
)
from rag.registry import load_chunk_registry
from rag.retrieve import (
load_bm25_index,
load_vectorstore,
retrieve_documents_with_query_transform,
)
# evaluation/ package directory and project root
PACKAGE_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = PACKAGE_DIR.parent
DATASET_PATH = PACKAGE_DIR / "data" / "eval_dataset.json"
def load_dataset() -> list[dict]:
"""Load the golden evaluation dataset from tests/eval_dataset.json."""
return json.loads(DATASET_PATH.read_text(encoding="utf-8"))
def build_llm():
"""Instantiate the RAG LLM based on RABBOOK_LLM_PROVIDER."""
if DEFAULT_LLM_PROVIDER == "groq":
from langchain_groq import ChatGroq
return ChatGroq(model=DEFAULT_LLM_MODEL, temperature=0.0)
if DEFAULT_LLM_PROVIDER == "gemini":
from langchain_google_genai import ChatGoogleGenerativeAI
return ChatGoogleGenerativeAI(model=DEFAULT_LLM_MODEL, temperature=0.0)
from langchain_ollama import ChatOllama
# thinking=False suppresses <think> blocks (e.g. Gemma) — matches app/web.py
ollama_kwargs = dict(
model=DEFAULT_LLM_MODEL,
base_url=OLLAMA_BASE_URL,
num_gpu=OLLAMA_NUM_GPU,
temperature=0.0,
)
if not OLLAMA_THINKING_MODE:
ollama_kwargs["thinking"] = False
return ChatOllama(**ollama_kwargs)
def build_embeddings():
"""Return a HuggingFace all-MiniLM-L6-v2 embeddings instance."""
from langchain_huggingface import HuggingFaceEmbeddings
return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def build_reranker():
"""Return a CrossEncoder reranker using DEFAULT_RERANK_MODEL."""
from sentence_transformers import CrossEncoder
return CrossEncoder(DEFAULT_RERANK_MODEL)
def build_evaluator_llm():
"""
Return the LLM used for RAGAS judging.
Prefers Gemini gemini-3.1-flash-lite for structured-output quality.
Falls back to the RAG LLM when GEMINI_KEY is not set.
Note: if RABBOOK_LLM_PROVIDER is ollama, set GEMINI_KEY for better results.
"""
from langchain_google_genai import ChatGoogleGenerativeAI
api_key = get_google_api_key()
if not api_key:
print("Warning: GEMINI_KEY not set — falling back to RAG LLM for evaluation.")
return build_llm()
return ChatGoogleGenerativeAI(model="gemini-3.1-flash-lite", temperature=0.0, google_api_key=api_key)
def load_retrieval_bundle(embeddings) -> tuple:
"""
Load and return (vectorstore, bm25_index) from disk.
Both are built from the same chunk registry so BM25 and dense
retrieval are always in sync with the current ingested documents.
"""
vectorstore = load_vectorstore(str(DB_DIR), embeddings)
chunk_registry = load_chunk_registry()
bm25_index = load_bm25_index(chunk_registry=chunk_registry, vectorstore=vectorstore)
return vectorstore, bm25_index
def retrieve_chunk_ids(
question: str,
vectorstore,
bm25_index,
reranker,
llm,
k: int = DEFAULT_RETRIEVAL_K,
) -> list[str]:
"""
Run the retrieval pipeline for a single question and return chunk ids in rank order.
Query transformation is disabled so results are deterministic across runs.
"""
results = retrieve_documents_with_query_transform(
vectorstore,
question,
k=k,
reranker=reranker,
bm25_index=bm25_index,
query_transformer=llm,
enable_query_transform=False,
candidate_k=DEFAULT_RERANK_CANDIDATE_K,
bm25_candidate_k=DEFAULT_BM25_CANDIDATE_K,
metadata_filter=None,
include_debug=False,
)
return [doc.metadata.get("chunk_id") for doc, _score in results]