File size: 4,404 Bytes
c76423f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Shared helpers for the three evaluation scripts.

Centralises model construction, dataset loading, and retrieval so that
evaluate_retrieval_metrics.py, evaluate_ragas.py, and evaluate_agent.py
stay short and focused on their own logic.
"""
import json
import warnings
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

warnings.filterwarnings("ignore", category=DeprecationWarning)

from core.config import (
    DB_DIR,
    DEFAULT_BM25_CANDIDATE_K,
    DEFAULT_LLM_MODEL,
    DEFAULT_LLM_PROVIDER,
    DEFAULT_RERANK_CANDIDATE_K,
    DEFAULT_RERANK_MODEL,
    DEFAULT_RETRIEVAL_K,
    OLLAMA_BASE_URL,
    OLLAMA_NUM_GPU,
    OLLAMA_THINKING_MODE,
    get_google_api_key,
)
from rag.registry import load_chunk_registry
from rag.retrieve import (
    load_bm25_index,
    load_vectorstore,
    retrieve_documents_with_query_transform,
)

# evaluation/ package directory and project root
PACKAGE_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = PACKAGE_DIR.parent

DATASET_PATH = PACKAGE_DIR / "data" / "eval_dataset.json"


def load_dataset() -> list[dict]:
    """Load the golden evaluation dataset from tests/eval_dataset.json."""
    return json.loads(DATASET_PATH.read_text(encoding="utf-8"))


def build_llm():
    """Instantiate the RAG LLM based on RABBOOK_LLM_PROVIDER."""
    if DEFAULT_LLM_PROVIDER == "groq":
        from langchain_groq import ChatGroq
        return ChatGroq(model=DEFAULT_LLM_MODEL, temperature=0.0)
    if DEFAULT_LLM_PROVIDER == "gemini":
        from langchain_google_genai import ChatGoogleGenerativeAI
        return ChatGoogleGenerativeAI(model=DEFAULT_LLM_MODEL, temperature=0.0)
    from langchain_ollama import ChatOllama
    # thinking=False suppresses <think> blocks (e.g. Gemma) — matches app/web.py
    ollama_kwargs = dict(
        model=DEFAULT_LLM_MODEL,
        base_url=OLLAMA_BASE_URL,
        num_gpu=OLLAMA_NUM_GPU,
        temperature=0.0,
    )
    if not OLLAMA_THINKING_MODE:
        ollama_kwargs["thinking"] = False
    return ChatOllama(**ollama_kwargs)


def build_embeddings():
    """Return a HuggingFace all-MiniLM-L6-v2 embeddings instance."""
    from langchain_huggingface import HuggingFaceEmbeddings
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def build_reranker():
    """Return a CrossEncoder reranker using DEFAULT_RERANK_MODEL."""
    from sentence_transformers import CrossEncoder
    return CrossEncoder(DEFAULT_RERANK_MODEL)


def build_evaluator_llm():
    """
    Return the LLM used for RAGAS judging.

    Prefers Gemini gemini-3.1-flash-lite for structured-output quality.
    Falls back to the RAG LLM when GEMINI_KEY is not set.
    Note: if RABBOOK_LLM_PROVIDER is ollama, set GEMINI_KEY for better results.
    """
    from langchain_google_genai import ChatGoogleGenerativeAI
    api_key = get_google_api_key()
    if not api_key:
        print("Warning: GEMINI_KEY not set — falling back to RAG LLM for evaluation.")
        return build_llm()
    return ChatGoogleGenerativeAI(model="gemini-3.1-flash-lite", temperature=0.0, google_api_key=api_key)


def load_retrieval_bundle(embeddings) -> tuple:
    """
    Load and return (vectorstore, bm25_index) from disk.

    Both are built from the same chunk registry so BM25 and dense
    retrieval are always in sync with the current ingested documents.
    """
    vectorstore = load_vectorstore(str(DB_DIR), embeddings)
    chunk_registry = load_chunk_registry()
    bm25_index = load_bm25_index(chunk_registry=chunk_registry, vectorstore=vectorstore)
    return vectorstore, bm25_index


def retrieve_chunk_ids(
    question: str,
    vectorstore,
    bm25_index,
    reranker,
    llm,
    k: int = DEFAULT_RETRIEVAL_K,
) -> list[str]:
    """
    Run the retrieval pipeline for a single question and return chunk ids in rank order.

    Query transformation is disabled so results are deterministic across runs.
    """
    results = retrieve_documents_with_query_transform(
        vectorstore,
        question,
        k=k,
        reranker=reranker,
        bm25_index=bm25_index,
        query_transformer=llm,
        enable_query_transform=False,
        candidate_k=DEFAULT_RERANK_CANDIDATE_K,
        bm25_candidate_k=DEFAULT_BM25_CANDIDATE_K,
        metadata_filter=None,
        include_debug=False,
    )
    return [doc.metadata.get("chunk_id") for doc, _score in results]