File size: 19,001 Bytes
276d415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2466c8
 
 
 
 
 
276d415
 
c2466c8
276d415
 
 
 
 
 
c2466c8
 
 
 
 
 
 
 
276d415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2466c8
 
 
276d415
 
c2466c8
 
276d415
c2466c8
 
 
276d415
 
 
 
c2466c8
 
 
276d415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2466c8
 
 
 
 
 
 
 
 
 
a7eba21
 
c2466c8
 
 
 
 
 
 
 
 
a7eba21
c2466c8
 
 
 
276d415
 
 
 
c2466c8
276d415
 
 
 
 
 
 
 
 
c2466c8
276d415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2466c8
 
a7eba21
c2466c8
a7eba21
 
 
c2466c8
 
 
 
 
a7eba21
c2466c8
 
a7eba21
c2466c8
a7eba21
c2466c8
a7eba21
 
c2466c8
 
 
 
 
 
 
 
 
 
 
 
 
 
276d415
 
 
c2466c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276d415
 
 
 
 
c2466c8
 
276d415
c2466c8
 
 
 
 
276d415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
"""
RAGAS evaluation of the retrieval pipeline.

Uses the official RAGAS library (https://docs.ragas.io) to score the pipeline
on four metrics — once with the cross-encoder reranker OFF (hybrid baseline)
and once ON — over a curated question set with reference answers:

    • Faithfulness        — answer claims supported by the retrieved context
    • Answer Relevancy    — answer actually addresses the question
    • Context Precision   — relevant chunks ranked near the top (with reference)
    • Context Recall      — reference answer covered by the retrieved context

Results are written to ``eval_results.json``, which the Gradio app renders in
its "📊 Evaluation" tab.

The judge is Gemini 3.1 Flash Lite (reached through Google's OpenAI-compatible
endpoint) wrapped for RAGAS; answer relevancy uses the project's EmbeddingGemma
embeddings.

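Usage:
    pip install -r requirements.txt -r requirements-eval.txt
    python evaluate.py              # default run: first 6 questions, both configs
    python evaluate.py --full       # all 12 questions, both configs
    python evaluate.py --quick      # first 4 questions only (smoke test)
    python evaluate.py --generate   # generation only: dump samples to eval_samples.json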
"""

from __future__ import annotations

import io
import json
import sys
import time
import types
from datetime import datetime, timezone

# Force UTF-8 console output on Windows so non-ASCII glyphs (Δ, →, —) are
# printed instead of failing under a legacy code page.
# Prefer ``reconfigure`` (Python 3.7+): it switches the encoding in place and
# leaves the stream's other settings (notably line buffering) untouched, so the
# per-question progress lines still show up as they are printed. Re-wrapping
# ``sys.stdout.buffer`` is only a fallback for stream objects that lack
# ``reconfigure``; ``line_buffering=True`` stops output from sitting in the new
# wrapper's block buffer during long runs.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", errors="replace", line_buffering=True
    )

# ---------------------------------------------------------------------------
# Compatibility shim for RAGAS 0.4.3, which unconditionally imports
# ``langchain_community.chat_models.vertexai`` (the class is only used in an
# internal list of supported model types). langchain-community 0.4.x removed
# that submodule and this project stays on the LangChain 1.x stack, so a
# placeholder module is registered *before* ragas is imported. The placeholder
# is never exercised — the judge is a ChatOpenAI client — which keeps the full
# langchain 1.x app stack intact with no downgrade.
# ---------------------------------------------------------------------------
if "langchain_community.chat_models.vertexai" not in sys.modules:
    _vx = types.ModuleType("langchain_community.chat_models.vertexai")
    setattr(_vx, "ChatVertexAI", type("ChatVertexAI", (), {}))  # placeholder class, never instantiated
    sys.modules[_vx.__name__] = _vx

import numpy as np  # noqa: E402

# NOTE: ragas / langchain_openai are imported lazily inside the judging path
# only. The ``--generate`` phase (torch: embedder + reranker + Chroma) then runs
# with exactly the app's import surface — importing the heavy ragas stack
# alongside torch was triggering a native segfault on this Windows / Python 3.14
# box. Generation and judging are therefore split into two processes.

import rag_chain  # noqa: E402
from config import GOOGLE_API_KEY, RERANKER_MODEL, RETRIEVAL_FETCH_K  # noqa: E402

# NOTE: RAGAS's batch ``evaluate()`` uses an async executor whose per-job
# ``asyncio.timeout`` is incompatible with Python 3.14's asyncio. We instead
# call each metric's synchronous ``single_turn_score`` in a plain loop — same
# RAGAS metric implementations and prompts, just driven sequentially.

# Judge — Gemini 3.1 Flash Lite via Google's OpenAI-compatible endpoint (httpx,
# NOT the grpc client, which segfaults alongside torch on Python 3.14). 500 RPD
# / 250K TPM gives headroom for a full 12-question A/B. (Groq free tiers proved
# too token-limited; Gemma's thinking mode breaks RAGAS JSON parsing.)
JUDGE_MODEL = "gemini-3.1-flash-lite"
GOOGLE_OPENAI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"
# Answer generation runs on Google Gemma (the app's default model) — a separate
# rate-limit bucket from the judge. This label is what gets recorded as
# ``gen_model`` in the output metadata.
GEN_LLM_LABEL = "Gemma 4 MoE 26B  [Google]"
# Judged scores are written next to the vector-store directory.
RESULTS_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_results.json"

# Canonical metric keys; fixes the order used for aggregation, deltas and the
# printed summary table.
METRIC_ORDER = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]


# ---------------------------------------------------------------------------
# Curated evaluation set — questions + reference (ground-truth) answers
# grounded in the 12 primary texts in the knowledge base.
#
# Each entry has exactly two keys: "question" (sent to the RAG pipeline) and
# "reference" (the ground-truth answer handed to the reference-based metrics).
# The references are judge input, so they must be clean text: the dashes below
# are real em dashes, not mis-decoded byte sequences.
# ---------------------------------------------------------------------------
EVAL_SET: list[dict] = [
    {
        "question": "What does Nietzsche mean by the death of God?",
        "reference": (
            "Nietzsche uses the death of God to describe the collapse of "
            "belief in a transcendent source of meaning and morality. It is "
            "not a literal claim but a diagnosis of modern culture: with the "
            "divine foundation gone, inherited values lose their ground, "
            "risking nihilism, and humanity must create new values itself."
        ),
    },
    {
        "question": "How does Schopenhauer view suffering and the will to live?",
        "reference": (
            "Schopenhauer holds that the will to live is the blind, insatiable "
            "force underlying all existence. Because desire is endless and its "
            "satisfaction fleeting, life is dominated by suffering. Relief comes "
            "only through denial of the will, aesthetic contemplation, and "
            "compassion."
        ),
    },
    {
        "question": "What is Hume's argument about causation?",
        "reference": (
            "Hume argues that we never perceive a necessary connection between "
            "cause and effect, only the constant conjunction of events. Our "
            "belief in causation is a habit of the mind formed by repeated "
            "experience, not a truth derived from reason."
        ),
    },
    {
        "question": "Can we have certain knowledge of the external world according to Russell?",
        "reference": (
            "Russell distinguishes knowledge by acquaintance from knowledge by "
            "description. We are directly acquainted only with sense-data; the "
            "existence of physical objects is an inference. He argues the "
            "external world is the best hypothesis explaining our sense-data, "
            "though not known with absolute certainty."
        ),
    },
    {
        "question": "What is Kant's categorical imperative?",
        "reference": (
            "Kant's categorical imperative is an unconditional moral law: act "
            "only on a maxim you could will to become a universal law. It "
            "commands independently of desires or consequences, and requires "
            "treating humanity always as an end and never merely as a means."
        ),
    },
    {
        "question": "How does Mill justify the principle of utility?",
        "reference": (
            "Mill grounds morality in the greatest happiness principle: actions "
            "are right insofar as they promote happiness and wrong as they "
            "produce its reverse. He argues happiness is the sole thing desired "
            "as an end, and distinguishes higher (intellectual) from lower "
            "(bodily) pleasures by quality, not only quantity."
        ),
    },
    {
        "question": "What does Marcus Aurelius advise about things outside our control?",
        "reference": (
            "Marcus Aurelius, following Stoic doctrine, advises accepting what "
            "is outside our control as part of nature's order, and focusing only "
            "on our own judgments and actions. Externals cannot harm the rational "
            "self; disturbance comes from our opinions about events, not events "
            "themselves."
        ),
    },
    {
        "question": "What is Epictetus's distinction between what is in our power and what is not?",
        "reference": (
            "Epictetus opens the Enchiridion by dividing things into those in our "
            "power — our opinions, desires, and aversions — and those not — body, "
            "property, reputation. Tranquility comes from caring only about what "
            "is in our power and treating the rest with indifference."
        ),
    },
    {
        "question": "What is Plato's ideal society in The Republic?",
        "reference": (
            "Plato's ideal state is a just city ordered into three classes — "
            "rulers (philosopher-kings), guardians, and producers — mirroring the "
            "soul's reason, spirit, and appetite. Justice is each part performing "
            "its proper role. The guardian class shares property and family, and "
            "rulers are chosen for wisdom and educated to know the Good."
        ),
    },
    {
        "question": "What is the will to power in Nietzsche's thought?",
        "reference": (
            "For Nietzsche the will to power is the fundamental drive of life: "
            "not mere survival but the striving to grow, overcome, and impose "
            "form. It underlies values and actions, and the higher type affirms "
            "life by creating values out of this creative, self-overcoming force."
        ),
    },
    {
        "question": "How does Nietzsche characterize master and slave morality?",
        "reference": (
            "In the Genealogy of Morality, Nietzsche contrasts master morality, "
            "which originates in the strong and calls 'good' what is noble and "
            "powerful, with slave morality, which arises from the resentment of "
            "the weak and revalues humility, meekness, and pity as good while "
            "branding the strong as evil."
        ),
    },
    {
        "question": "What role does eternal recurrence play in Nietzsche's philosophy?",
        "reference": (
            "Eternal recurrence is the thought that one's life will repeat "
            "identically and infinitely. Nietzsche poses it as a test of life-"
            "affirmation: to will the eternal return of every moment, including "
            "suffering, is the highest expression of amor fati and saying yes to "
            "existence."
        ),
    },
]


# ---------------------------------------------------------------------------
# RAGAS plumbing
# ---------------------------------------------------------------------------

def _build_judge():
    """Create the judge LLM used by the RAGAS metrics.

    Configures a ``ChatOpenAI`` client from ``JUDGE_MODEL``, ``GOOGLE_API_KEY``
    and ``GOOGLE_OPENAI_BASE`` and returns it wrapped in ragas'
    ``LangchainLLMWrapper``. Both third-party imports happen at call time, so
    importing this module alone does not load them.
    """
    from langchain_openai import ChatOpenAI
    from ragas.llms import LangchainLLMWrapper

    judge_settings = {
        "model": JUDGE_MODEL,
        "api_key": GOOGLE_API_KEY,
        "base_url": GOOGLE_OPENAI_BASE,
        "temperature": 0.0,
        "max_tokens": 3000,
        "timeout": 90,
        "max_retries": 4,  # absorb the occasional 15-RPM 429
    }
    return LangchainLLMWrapper(ChatOpenAI(**judge_settings))


# Gemini 3.1 Flash Lite allows 15 RPM. A short pace between metrics keeps
# multi-call metrics under the per-minute request limit.
# Unit: seconds slept between consecutive metric calls for the same sample.
PACE_SECONDS = 5


def _score_sample(sample: SingleTurnSample, scorers: dict) -> dict[str, float | None]:
    """Score one sample on all four metrics; isolate failures per-metric."""
    out: dict[str, float | None] = {}
    for i, (canon, metric) in enumerate(scorers.items()):
        if i:
            time.sleep(PACE_SECONDS)  # respect the 6000 TPM bucket
        try:
            v = float(metric.single_turn_score(sample))
            out[canon] = None if v != v else round(v, 3)  # NaN β†’ None
        except Exception as exc:
            print(f"      {canon} failed: {str(exc)[:70]}")
            out[canon] = None
    return out


# Generate with Gemma via Google's OpenAI-compatible endpoint (httpx). The
# native google.genai client uses grpc, which segfaults alongside torch on this
# Python 3.14 box — so we keep generation on the same httpx path as the judge.
# This is the model id sent in the chat-completions request.
GEN_MODEL_ID = "gemma-4-26b-a4b-it"


def _generate(question: str) -> dict:
    """Answer *question* once with the RAG pipeline: retrieve, then generate.

    Passages come from ``rag_chain.retrieve_docs(question, "All")``. Their
    ``page_content`` values are joined with blank lines and sent, together with
    the question, to ``GEN_MODEL_ID`` through the OpenAI-compatible endpoint at
    ``GOOGLE_OPENAI_BASE``, with ``rag_chain.SYSTEM_PROMPT`` as system message.

    Returns:
        ``{"answer": <message content of the first choice>,
        "context": <the retrieved documents>}``.
    """
    retrieved, _ = rag_chain.retrieve_docs(question, "All")  # torch retrieval (no grpc)
    passages = "\n\n".join(doc.page_content for doc in retrieved)

    from openai import OpenAI

    client = OpenAI(
        api_key=GOOGLE_API_KEY,
        base_url=GOOGLE_OPENAI_BASE,
        timeout=90,  # avoid indefinite hangs
        max_retries=2,
    )
    user_message = (
        "Relevant passages from your knowledge base:\n"
        f"{passages}\n\n"
        f"Question: {question}"
    )
    chat_messages = [
        {"role": "system", "content": rag_chain.SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=GEN_MODEL_ID,
        messages=chat_messages,
        temperature=0.3,
        timeout=90,
    )
    first_choice = completion.choices[0]
    return {"answer": first_choice.message.content, "context": retrieved}


def _generate_with_retry(question: str, retries: int = 5) -> dict:
    """Run ``_generate`` with linear backoff between failed attempts.

    Any exception from ``_generate`` triggers a retry; the intended case is a
    Google RPM (HTTP 429) limit. The wait grows by 8 s per failed attempt
    (8, 16, 24, ...).

    Args:
        question: The question forwarded to ``_generate``.
        retries: Maximum number of attempts. Values below 1 are treated as 1,
            so the function always makes one attempt and never falls through
            to an implicit ``None``.

    Returns:
        The dict produced by ``_generate``.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            return _generate(question)
        except Exception as exc:
            if attempt == attempts:
                raise
            wait = 8 * attempt
            print(f"    generation retry in {wait}s ({str(exc)[:50]})")
            time.sleep(wait)
    # Unreachable: the last iteration either returns or re-raises.
    raise AssertionError("_generate_with_retry exhausted attempts without a result")


def run_config(name: str, use_reranker: bool, eval_set: list[dict], scorers: dict) -> dict:
    """Generate, judge and aggregate one retrieval configuration.

    For every item of *eval_set* the answer is produced with
    ``_generate_with_retry``, wrapped in a RAGAS ``SingleTurnSample`` and scored
    with ``_score_sample``. One progress line is printed per question.

    Args:
        name: Label printed in the section header.
        use_reranker: Value assigned to ``rag_chain.USE_RERANKER`` before the run.
        eval_set: Items carrying ``"question"`` and ``"reference"`` keys.
        scorers: Mapping of metric name to RAGAS metric object.

    Returns:
        ``{"aggregate": {metric: mean}, "per_question": [row, ...]}`` where each
        row holds the question plus one score (or ``None``) per scorer. Means
        skip ``None`` scores and are rounded to four decimals; a metric without
        any usable score is reported as ``0.0``.
    """
    from ragas import SingleTurnSample

    def _cell(score: float | None) -> str:
        """Format one score for the progress line; an em dash marks a missing score."""
        return f"{score:.2f}" if score is not None else " — "

    rag_chain.USE_RERANKER = use_reranker  # runtime toggle (see retrieve_docs)
    print(f"\n=== {name}  (reranker={'ON' if use_reranker else 'OFF'}) ===")
    total = len(eval_set)
    per_question = []
    for i, item in enumerate(eval_set, 1):
        t0 = time.perf_counter()
        result = _generate_with_retry(item["question"])
        sample = SingleTurnSample(
            user_input=item["question"],
            response=result["answer"],
            retrieved_contexts=[d.page_content for d in result["context"]],
            reference=item["reference"],
        )
        scores = _score_sample(sample, scorers)
        per_question.append({"question": item["question"], **scores})
        elapsed = time.perf_counter() - t0
        # ``.get`` keeps the progress line working even if a metric is absent
        # from *scorers*.
        print(
            f"  [{i}/{total}] {elapsed:5.1f}s  "
            f"F={_cell(scores.get('faithfulness'))} "
            f"AR={_cell(scores.get('answer_relevancy'))} "
            f"CP={_cell(scores.get('context_precision'))} "
            f"CR={_cell(scores.get('context_recall'))}  "
            f"{item['question'][:46]}"
        )

    agg = {}
    for m in METRIC_ORDER:
        vals = [r[m] for r in per_question if r.get(m) is not None]
        # 0.0 (not None) keeps the aggregate usable in float formatting downstream.
        agg[m] = round(float(np.mean(vals)), 4) if vals else 0.0
    return {"aggregate": agg, "per_question": per_question}


# Where the ``--generate`` phase writes its raw samples (next to the
# vector-store directory).
SAMPLES_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_samples.json"

# (name, use_reranker, use_query_rewrite) — an incremental A/B/C ablation.
CONFIGS = [
    ("Baseline (Hybrid)", False, False),
    ("+ Reranker", True, False),
    ("+ Query Rewrite", True, True),
]


def generate_samples(eval_set: list[dict]) -> dict:
    """Phase A: produce raw RAG samples for every configuration in ``CONFIGS``.

    Runs the real pipeline (retrieval + generation) for each question under
    each config and collects the outputs. No LLM judging happens here; that is
    a separate phase.

    Side effects: sets ``rag_chain.USE_CORRECTIVE_RAG`` to ``False`` and
    overwrites ``rag_chain.USE_RERANKER`` / ``rag_chain.USE_QUERY_REWRITE`` per
    config (they are left at the last config's values), and prints progress.

    Returns:
        Mapping of config name to a list of rows, each with the keys
        ``question``, ``reference``, ``answer`` and ``contexts``.
    """
    rag_chain.USE_CORRECTIVE_RAG = False  # never abstain during evaluation
    samples_by_config: dict[str, list[dict]] = {}
    total = len(eval_set)
    for label, rerank_on, rewrite_on in CONFIGS:
        rag_chain.USE_RERANKER = rerank_on
        rag_chain.USE_QUERY_REWRITE = rewrite_on
        print(f"\n=== Generating: {label}  (rerank={rerank_on}, rewrite={rewrite_on}) ===")
        config_rows = []
        for position, entry in enumerate(eval_set, start=1):
            generated = _generate_with_retry(entry["question"])
            row = {
                "question": entry["question"],
                "reference": entry["reference"],
                "answer": generated["answer"],
                "contexts": [doc.page_content for doc in generated["context"]],
            }
            config_rows.append(row)
            print(f"  [{position}/{total}] {entry['question'][:55]}")
        samples_by_config[label] = config_rows
    return samples_by_config


def main() -> None:
    """Command-line entry point.

    Flags (read from ``sys.argv``):
        --generate  Only run retrieval + generation for every entry of
                    ``CONFIGS`` and write the samples to ``SAMPLES_PATH``.
                    Uses all questions unless ``--quick`` is given.
        --quick     Use the first 4 questions (smoke test).
        --full      Judging mode with all questions.
        (default)   Judging mode with the first 6 questions.

    In judging mode the reranker-off and reranker-on configurations are each
    generated and scored in this process via ``run_config``; aggregates, deltas
    and per-question scores are written to ``RESULTS_PATH`` and a summary table
    is printed.

    NOTE(review): judging mode regenerates answers itself and does not read
    ``SAMPLES_PATH`` — confirm whether it is meant to consume the file written
    by ``--generate``.

    Raises:
        SystemExit: If ``GOOGLE_API_KEY`` is not set.
    """
    quick = "--quick" in sys.argv
    full = "--full" in sys.argv

    # Generation and judging both authenticate with GOOGLE_API_KEY, so fail
    # fast here instead of exhausting the generation retries without a key.
    if not GOOGLE_API_KEY:
        raise SystemExit(
            "GOOGLE_API_KEY not set — needed for the generation and judge models."
        )

    if "--generate" in sys.argv:
        eval_set = EVAL_SET[:4] if quick else EVAL_SET
        samples = {
            "metadata": {
                "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "gen_model": GEN_LLM_LABEL,
                "reranker_model": RERANKER_MODEL,
                "fetch_k": RETRIEVAL_FETCH_K,
                "n_questions": len(eval_set),
            },
            "samples": generate_samples(eval_set),
        }
        SAMPLES_PATH.write_text(json.dumps(samples, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"\nSaved {len(eval_set)} questions x {len(CONFIGS)} configs → {SAMPLES_PATH}")
        return

    # Default to 6 questions: RAGAS is token-heavy, so a full 12-question A/B
    # (~24 samples) is slow. The earlier estimates (~70 min full, ~30 min for
    # six questions × 2 configs) were noted against a Groq 6000-TPM judge and
    # may not hold for the current judge — TODO re-measure.
    eval_set = EVAL_SET[:4] if quick else (EVAL_SET if full else EVAL_SET[:6])

    # Imported here, not at module level, so the --generate path above never
    # loads ragas.
    from ragas.embeddings import LangchainEmbeddingsWrapper
    from ragas.metrics import (
        Faithfulness, LLMContextPrecisionWithReference,
        LLMContextRecall, ResponseRelevancy,
    )
    judge = _build_judge()
    embeddings = LangchainEmbeddingsWrapper(rag_chain._get_embeddings())
    scorers = {
        "faithfulness": Faithfulness(llm=judge),
        # strictness=1 → asks the LLM for n=1 completion (a limit noted for the
        # earlier Groq judge, which only allowed n=1).
        "answer_relevancy": ResponseRelevancy(llm=judge, embeddings=embeddings, strictness=1),
        "context_precision": LLMContextPrecisionWithReference(llm=judge),
        "context_recall": LLMContextRecall(llm=judge),
    }

    baseline = run_config("Baseline — Hybrid (no rerank)", False, eval_set, scorers)
    reranked = run_config("With Cross-Encoder Rerank", True, eval_set, scorers)

    # Per-metric improvement of the reranked run over the baseline.
    deltas = {
        m: round(reranked["aggregate"].get(m, 0.0) - baseline["aggregate"].get(m, 0.0), 4)
        for m in METRIC_ORDER
    }

    import ragas
    out = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "framework": f"ragas {ragas.__version__}",
            "judge_model": JUDGE_MODEL,
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "configs": {
            "Baseline (Hybrid, no rerank)": baseline["aggregate"],
            "With Cross-Encoder Rerank": reranked["aggregate"],
        },
        "deltas": deltas,
        "per_question": {
            "Baseline (Hybrid, no rerank)": baseline["per_question"],
            "With Cross-Encoder Rerank": reranked["per_question"],
        },
    }
    RESULTS_PATH.write_text(json.dumps(out, indent=2), encoding="utf-8")

    print("\n" + "=" * 72)
    print(f"{'Metric':<22}{'Baseline':>12}{'Reranked':>12}{'Δ':>10}")
    print("-" * 72)
    for m in METRIC_ORDER:
        b = baseline["aggregate"].get(m, 0.0)
        r = reranked["aggregate"].get(m, 0.0)
        print(f"{m:<22}{b:>12.3f}{r:>12.3f}{deltas[m]:>+10.3f}")
    print("=" * 72)
    print(f"Saved → {RESULTS_PATH}")


# Run the evaluation only when this file is executed as a script; importing the
# module must not start a run.
if __name__ == "__main__":
    main()