# philosopher-chat / evaluate.py
# fikri0o0's picture
# Add query rewriting + corrective RAG + 3-stage RAGAS ablation
# a7eba21 verified
"""
RAGAS evaluation of the retrieval pipeline.
Uses the official RAGAS library (https://docs.ragas.io) to score the pipeline
on four metrics — once with the cross-encoder reranker OFF (hybrid baseline)
and once ON — over a curated question set with reference answers:
  • Faithfulness — answer claims supported by the retrieved context
  • Answer Relevancy — answer actually addresses the question
  • Context Precision — relevant chunks ranked near the top (with reference)
  • Context Recall — reference answer covered by the retrieved context
Results are written to ``eval_results.json``, which the Gradio app renders in
its "📊 Evaluation" tab.
The judge is Gemini 3.1 Flash Lite (via Google's OpenAI-compatible endpoint)
wrapped for RAGAS; answer relevancy uses the project's EmbeddingGemma embeddings.
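Usage:
    pip install -r requirements.txt -r requirements-eval.txt
    python evaluate.py             # default run: first 6 questions, both configs
    python evaluate.py --full      # all 12 questions, both configs
    python evaluate.py --quick     # first 4 questions only (smoke test)
    python evaluate.py --generate  # retrieval + generation only, for every entry
                                   # in CONFIGS; writes eval_samples.json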
"""
from __future__ import annotations
import io
import json
import sys
import time
import types
from datetime import datetime, timezone
# Force UTF-8 console output on Windows (Δ, etc.).
#
# Prefer reconfiguring the existing stream in place: ``reconfigure`` keeps the
# stream's current buffering settings, whereas wrapping ``sys.stdout.buffer`` in
# a brand-new TextIOWrapper defaults to ``line_buffering=False`` and would hold
# back the per-question progress lines of a long evaluation run.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    # Fallback for stream objects that expose a binary buffer but cannot be
    # reconfigured; request line buffering explicitly so progress stays live.
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", errors="replace", line_buffering=True
    )
# ---------------------------------------------------------------------------
# Compatibility shim for RAGAS 0.4.3.
#
# RAGAS hard-imports ``langchain_community.chat_models.vertexai`` (it is only
# referenced from an internal list of supported model types), but
# langchain-community 0.4.x no longer ships that submodule. Since this project
# stays on the modern LangChain 1.x stack, a placeholder module is registered
# *before* ragas is imported. The placeholder is never exercised, because the
# judge is a ChatOpenAI client, so nothing in the app stack has to be
# downgraded.
# ---------------------------------------------------------------------------
_VERTEXAI_MODULE = "langchain_community.chat_models.vertexai"
if _VERTEXAI_MODULE not in sys.modules:
    _vx = types.ModuleType(_VERTEXAI_MODULE)
    # Empty stand-in class: it only has to exist by name, it is never instantiated.
    setattr(_vx, "ChatVertexAI", type("ChatVertexAI", (), {}))
    sys.modules[_VERTEXAI_MODULE] = _vx
import numpy as np # noqa: E402
# NOTE: ragas / langchain_openai are imported lazily inside the judging path
# only. The ``--generate`` phase (torch: embedder + reranker + Chroma) then runs
# with exactly the app's import surface — importing the heavy ragas stack
# alongside torch was triggering a native segfault on this Windows / Python 3.14
# box. Generation and judging are therefore split into two processes.
import rag_chain # noqa: E402
from config import GOOGLE_API_KEY, RERANKER_MODEL, RETRIEVAL_FETCH_K # noqa: E402
# NOTE: RAGAS's batch ``evaluate()`` uses an async executor whose per-job
# ``asyncio.timeout`` is incompatible with Python 3.14's asyncio. We instead
# call each metric's synchronous ``single_turn_score`` in a plain loop — same
# RAGAS metric implementations and prompts, just driven sequentially.
# Judge — Gemini 3.1 Flash Lite via Google's OpenAI-compatible endpoint (httpx,
# NOT the grpc client, which segfaults alongside torch on Python 3.14). 500 RPD
# / 250K TPM gives headroom for a full 12-question A/B. (Groq free tiers proved
# too token-limited; Gemma's thinking mode breaks RAGAS JSON parsing.)
JUDGE_MODEL = "gemini-3.1-flash-lite"
# Base URL shared by the judge client and the answer-generation client.
GOOGLE_OPENAI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"
# Answer generation runs on Google Gemma (the app's default model) — a separate
# rate-limit bucket from the judge. This label is only recorded in the output
# metadata.
GEN_LLM_LABEL = "Gemma 4 MoE 26B [Google]"
# Aggregated scores are written here, next to the vector store directory.
RESULTS_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_results.json"
# Canonical metric keys, in the order they are aggregated and printed.
METRIC_ORDER = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]
# ---------------------------------------------------------------------------
# Curated evaluation set — questions + reference (ground-truth) answers
# grounded in the 12 primary texts in the knowledge base.
#
# Each entry is a dict with exactly two string keys: "question" (sent through
# the RAG pipeline) and "reference" (the ground-truth answer handed to the
# reference-based RAGAS metrics). Order matters: the --quick and default runs
# take a prefix of this list.
# ---------------------------------------------------------------------------
EVAL_SET: list[dict] = [
    {
        "question": "What does Nietzsche mean by the death of God?",
        "reference": (
            "Nietzsche uses the death of God to describe the collapse of "
            "belief in a transcendent source of meaning and morality. It is "
            "not a literal claim but a diagnosis of modern culture: with the "
            "divine foundation gone, inherited values lose their ground, "
            "risking nihilism, and humanity must create new values itself."
        ),
    },
    {
        "question": "How does Schopenhauer view suffering and the will to live?",
        "reference": (
            "Schopenhauer holds that the will to live is the blind, insatiable "
            "force underlying all existence. Because desire is endless and its "
            "satisfaction fleeting, life is dominated by suffering. Relief comes "
            "only through denial of the will, aesthetic contemplation, and "
            "compassion."
        ),
    },
    {
        "question": "What is Hume's argument about causation?",
        "reference": (
            "Hume argues that we never perceive a necessary connection between "
            "cause and effect, only the constant conjunction of events. Our "
            "belief in causation is a habit of the mind formed by repeated "
            "experience, not a truth derived from reason."
        ),
    },
    {
        "question": "Can we have certain knowledge of the external world according to Russell?",
        "reference": (
            "Russell distinguishes knowledge by acquaintance from knowledge by "
            "description. We are directly acquainted only with sense-data; the "
            "existence of physical objects is an inference. He argues the "
            "external world is the best hypothesis explaining our sense-data, "
            "though not known with absolute certainty."
        ),
    },
    {
        "question": "What is Kant's categorical imperative?",
        "reference": (
            "Kant's categorical imperative is an unconditional moral law: act "
            "only on a maxim you could will to become a universal law. It "
            "commands independently of desires or consequences, and requires "
            "treating humanity always as an end and never merely as a means."
        ),
    },
    {
        "question": "How does Mill justify the principle of utility?",
        "reference": (
            "Mill grounds morality in the greatest happiness principle: actions "
            "are right insofar as they promote happiness and wrong as they "
            "produce its reverse. He argues happiness is the sole thing desired "
            "as an end, and distinguishes higher (intellectual) from lower "
            "(bodily) pleasures by quality, not only quantity."
        ),
    },
    {
        "question": "What does Marcus Aurelius advise about things outside our control?",
        "reference": (
            "Marcus Aurelius, following Stoic doctrine, advises accepting what "
            "is outside our control as part of nature's order, and focusing only "
            "on our own judgments and actions. Externals cannot harm the rational "
            "self; disturbance comes from our opinions about events, not events "
            "themselves."
        ),
    },
    {
        "question": "What is Epictetus's distinction between what is in our power and what is not?",
        "reference": (
            "Epictetus opens the Enchiridion by dividing things into those in our "
            "power — our opinions, desires, and aversions — and those not — body, "
            "property, reputation. Tranquility comes from caring only about what "
            "is in our power and treating the rest with indifference."
        ),
    },
    {
        "question": "What is Plato's ideal society in The Republic?",
        "reference": (
            "Plato's ideal state is a just city ordered into three classes — "
            "rulers (philosopher-kings), guardians, and producers — mirroring the "
            "soul's reason, spirit, and appetite. Justice is each part performing "
            "its proper role. The guardian class shares property and family, and "
            "rulers are chosen for wisdom and educated to know the Good."
        ),
    },
    {
        "question": "What is the will to power in Nietzsche's thought?",
        "reference": (
            "For Nietzsche the will to power is the fundamental drive of life: "
            "not mere survival but the striving to grow, overcome, and impose "
            "form. It underlies values and actions, and the higher type affirms "
            "life by creating values out of this creative, self-overcoming force."
        ),
    },
    {
        "question": "How does Nietzsche characterize master and slave morality?",
        "reference": (
            "In the Genealogy of Morality, Nietzsche contrasts master morality, "
            "which originates in the strong and calls 'good' what is noble and "
            "powerful, with slave morality, which arises from the resentment of "
            "the weak and revalues humility, meekness, and pity as good while "
            "branding the strong as evil."
        ),
    },
    {
        "question": "What role does eternal recurrence play in Nietzsche's philosophy?",
        "reference": (
            "Eternal recurrence is the thought that one's life will repeat "
            "identically and infinitely. Nietzsche poses it as a test of life-"
            "affirmation: to will the eternal return of every moment, including "
            "suffering, is the highest expression of amor fati and saying yes to "
            "existence."
        ),
    },
]
# ---------------------------------------------------------------------------
# RAGAS plumbing
# ---------------------------------------------------------------------------
def _build_judge():
    """Create the judge LLM used by the RAGAS metrics.

    Builds a ``ChatOpenAI`` client for ``JUDGE_MODEL`` against
    ``GOOGLE_OPENAI_BASE`` (authenticated with ``GOOGLE_API_KEY``) and returns
    it wrapped in RAGAS's ``LangchainLLMWrapper``. The imports are local so
    that only the judging path pulls in langchain_openai / ragas.
    """
    from langchain_openai import ChatOpenAI
    from ragas.llms import LangchainLLMWrapper

    judge_settings = {
        "model": JUDGE_MODEL,
        "api_key": GOOGLE_API_KEY,
        "base_url": GOOGLE_OPENAI_BASE,
        "temperature": 0.0,  # deterministic judging
        "max_tokens": 3000,
        "timeout": 90,
        "max_retries": 4,  # absorb the occasional 15-RPM 429
    }
    chat_model = ChatOpenAI(**judge_settings)
    return LangchainLLMWrapper(chat_model)
# Gemini 3.1 Flash Lite allows 15 RPM. A short pause between metrics keeps
# multi-call metrics under the per-minute request limit.
PACE_SECONDS = 5


def _score_sample(sample: SingleTurnSample, scorers: dict) -> dict[str, float | None]:
    """Score one sample with every metric in *scorers*, isolating failures.

    Args:
        sample: the sample handed to each metric's ``single_turn_score``.
        scorers: mapping of canonical metric name -> metric object.

    Returns:
        Mapping of metric name -> score rounded to 3 decimals, or ``None`` when
        the metric raised or produced NaN. One failing metric never prevents
        the remaining metrics from being scored.
    """
    results: dict[str, float | None] = {}
    is_first = True
    for metric_name, scorer in scorers.items():
        if not is_first:
            time.sleep(PACE_SECONDS)  # pace judge requests between metrics
        is_first = False
        try:
            raw = float(scorer.single_turn_score(sample))
        except Exception as err:
            print(f" {metric_name} failed: {str(err)[:70]}")
            results[metric_name] = None
            continue
        # NaN is the only float that is not equal to itself; report it as None.
        results[metric_name] = round(raw, 3) if raw == raw else None
    return results
# Generate with Gemma via Google's OpenAI-compatible endpoint (httpx). The
# native google.genai client uses grpc, which segfaults alongside torch on this
# Python 3.14 box — so we keep generation on the same httpx path as the judge.
GEN_MODEL_ID = "gemma-4-26b-a4b-it"


def _generate(question: str) -> dict:
    """Retrieve context for *question* and generate an answer with Gemma.

    Args:
        question: the user question to send through retrieval and generation.

    Returns:
        ``{"answer": str, "context": docs}`` where ``docs`` is whatever
        ``rag_chain.retrieve_docs`` returned. ``answer`` is always a string: a
        completion without content is reported as ``""`` rather than ``None``,
        so downstream samples and JSON dumps never carry a null answer.

    Raises:
        Any exception raised by retrieval or by the OpenAI client (including an
        ``IndexError`` if the response has no choices); callers are expected to
        retry.
    """
    docs, _ = rag_chain.retrieve_docs(question, "All")  # torch retrieval (no grpc)
    context_str = "\n\n".join(d.page_content for d in docs)

    # Imported after retrieval, as before, to keep the import order unchanged.
    from openai import OpenAI

    user = (f"Relevant passages from your knowledge base:\n{context_str}\n\n"
            f"Question: {question}")
    # The context manager closes the client's HTTP connection pool when the
    # request is done; previously every call left a pool open until GC.
    with OpenAI(api_key=GOOGLE_API_KEY, base_url=GOOGLE_OPENAI_BASE,
                timeout=90, max_retries=2) as client:  # avoid indefinite hangs
        resp = client.chat.completions.create(
            model=GEN_MODEL_ID,
            messages=[
                {"role": "system", "content": rag_chain.SYSTEM_PROMPT},
                {"role": "user", "content": user},
            ],
            temperature=0.3,
            timeout=90,
        )
    answer = resp.choices[0].message.content or ""
    return {"answer": answer, "context": docs}
def _generate_with_retry(question: str, retries: int = 5) -> dict:
    """Run ``_generate`` with linear backoff, mainly to ride out 429 rate limits.

    Args:
        question: the question to answer.
        retries: total number of attempts (must be >= 1).

    Returns:
        The dict returned by ``_generate``.

    Raises:
        ValueError: if ``retries`` is less than 1 (previously this fell through
            and silently returned ``None``).
        Exception: whatever the final attempt raised, re-raised unchanged.
    """
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")
    for attempt in range(1, retries + 1):
        try:
            return _generate(question)
        except Exception as exc:
            # Deliberately broad: any failure of an attempt (rate limit,
            # timeout, transient HTTP error) is retried until attempts run out.
            if attempt == retries:
                raise
            wait = 8 * attempt  # 8s, 16s, 24s, ...
            print(f" generation retry in {wait}s ({str(exc)[:50]})")
            time.sleep(wait)
    raise AssertionError("unreachable: the loop either returns or re-raises")
def run_config(name: str, use_reranker: bool, eval_set: list[dict], scorers: dict) -> dict:
    """Generate and judge every question in *eval_set* for one reranker setting.

    Args:
        name: human-readable label, used only in the progress output.
        use_reranker: value assigned to ``rag_chain.USE_RERANKER`` for this run.
        eval_set: entries with "question" and "reference" keys.
        scorers: mapping of metric name -> RAGAS metric, as used by
            ``_score_sample``.

    Returns:
        ``{"aggregate": {metric: mean}, "per_question": [{"question": ..., metric: score}]}``.
        Means ignore ``None`` scores; a metric with no usable score at all is
        reported as ``0.0`` (kept for compatibility with the delta/summary code).
    """
    from ragas import SingleTurnSample

    rag_chain.USE_RERANKER = use_reranker  # runtime toggle (see retrieve_docs)
    # Pin the remaining pipeline toggles the same way generate_samples() does
    # for its reranker-only configs, so this A/B varies the reranker alone and
    # the pipeline never abstains while it is being evaluated.
    rag_chain.USE_QUERY_REWRITE = False
    rag_chain.USE_CORRECTIVE_RAG = False
    print(f"\n=== {name} (reranker={'ON' if use_reranker else 'OFF'}) ===")

    def _cell(value: float | None) -> str:
        """Format one score for the progress line; missing scores show a dash."""
        return f"{value:.2f}" if value is not None else " — "

    per_question = []
    for i, item in enumerate(eval_set, 1):
        t0 = time.perf_counter()
        result = _generate_with_retry(item["question"])
        sample = SingleTurnSample(
            user_input=item["question"],
            response=result["answer"],
            retrieved_contexts=[d.page_content for d in result["context"]],
            reference=item["reference"],
        )
        scores = _score_sample(sample, scorers)
        per_question.append({"question": item["question"], **scores})
        print(f" [{i}/{len(eval_set)}] {time.perf_counter() - t0:5.1f}s "
              f"F={_cell(scores['faithfulness'])} AR={_cell(scores['answer_relevancy'])} "
              f"CP={_cell(scores['context_precision'])} CR={_cell(scores['context_recall'])} "
              f"{item['question'][:46]}")

    agg = {}
    for m in METRIC_ORDER:
        # Failed / NaN scores are stored as None and excluded from the mean.
        vals = [r[m] for r in per_question if r[m] is not None]
        agg[m] = round(float(np.mean(vals)), 4) if vals else 0.0
    return {"aggregate": agg, "per_question": per_question}
SAMPLES_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_samples.json"

# Each entry is (name, use_reranker, use_query_rewrite): an incremental
# A/B/C ablation in which every step switches on one more pipeline stage.
CONFIGS = [
    ("Baseline (Hybrid)", False, False),
    ("+ Reranker", True, False),
    ("+ Query Rewrite", True, True),
]


def generate_samples(eval_set: list[dict]) -> dict:
    """Phase A: produce RAG samples for every question under every config.

    Runs the real pipeline (retrieval + generation) once per question for each
    entry in ``CONFIGS`` and collects the outputs. Nothing is judged here, so
    the judge's rate limits do not apply and the torch+judge segfault is
    avoided; judging happens in a separate phase.

    Returns:
        Mapping of config name -> list of rows, each row holding the
        "question", "reference", generated "answer" and retrieved "contexts"
        (the page contents of the retrieved documents).
    """
    rag_chain.USE_CORRECTIVE_RAG = False  # never abstain during evaluation
    samples_by_config: dict[str, list[dict]] = {}
    for label, rerank_on, rewrite_on in CONFIGS:
        # Module-level runtime toggles for this config.
        rag_chain.USE_RERANKER = rerank_on
        rag_chain.USE_QUERY_REWRITE = rewrite_on
        print(f"\n=== Generating: {label} (rerank={rerank_on}, rewrite={rewrite_on}) ===")
        config_rows = []
        for position, entry in enumerate(eval_set, start=1):
            generated = _generate_with_retry(entry["question"])
            retrieved_texts = [doc.page_content for doc in generated["context"]]
            config_rows.append(
                {
                    "question": entry["question"],
                    "reference": entry["reference"],
                    "answer": generated["answer"],
                    "contexts": retrieved_texts,
                }
            )
            print(f" [{position}/{len(eval_set)}] {entry['question'][:55]}")
        samples_by_config[label] = config_rows
    return samples_by_config
def main() -> None:
    """Command-line entry point.

    Flags are read directly from ``sys.argv``:

    * ``--generate``: only run retrieval + generation for every entry in
      ``CONFIGS`` and write the samples to ``SAMPLES_PATH``, then exit.
    * ``--quick``: use the first 4 questions.
    * ``--full``: use all questions (judging path only; without it the judging
      path uses the first 6).

    Without ``--generate`` the script generates and judges a reranker OFF/ON
    A/B with RAGAS, writes the scores to ``RESULTS_PATH`` and prints a summary
    table.

    Raises:
        SystemExit: if ``GOOGLE_API_KEY`` is not set.
    """
    quick = "--quick" in sys.argv
    full = "--full" in sys.argv

    # Both phases call Google's endpoint (answer generation always, judging in
    # the default path), so fail fast here instead of discovering a missing
    # key only after the generation retries have backed off.
    if not GOOGLE_API_KEY:
        raise SystemExit(
            "GOOGLE_API_KEY not set — needed for answer generation and the judge model."
        )

    if "--generate" in sys.argv:
        eval_set = EVAL_SET[:4] if quick else EVAL_SET
        samples = {
            "metadata": {
                "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "gen_model": GEN_LLM_LABEL,
                "reranker_model": RERANKER_MODEL,
                "fetch_k": RETRIEVAL_FETCH_K,
                "n_questions": len(eval_set),
            },
            "samples": generate_samples(eval_set),
        }
        SAMPLES_PATH.write_text(json.dumps(samples, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"\nSaved {len(eval_set)} questions x {len(CONFIGS)} configs → {SAMPLES_PATH}")
        return

    # Default to 6 questions: every sample costs several judge calls plus the
    # PACE_SECONDS pauses between metrics, so the full 12-question A/B is slow.
    # Six questions x 2 configs is a representative, completable run; pass
    # --full for the whole set.
    eval_set = EVAL_SET[:4] if quick else (EVAL_SET if full else EVAL_SET[:6])

    # Heavy judge-side imports stay local so the --generate path never loads them.
    from ragas.embeddings import LangchainEmbeddingsWrapper
    from ragas.metrics import (
        Faithfulness, LLMContextPrecisionWithReference,
        LLMContextRecall, ResponseRelevancy,
    )

    judge = _build_judge()
    embeddings = LangchainEmbeddingsWrapper(rag_chain._get_embeddings())
    scorers = {
        "faithfulness": Faithfulness(llm=judge),
        # strictness=1 → asks the judge for a single completion (n=1); this
        # setting dates from the earlier Groq judge, which only allowed n=1.
        "answer_relevancy": ResponseRelevancy(llm=judge, embeddings=embeddings, strictness=1),
        "context_precision": LLMContextPrecisionWithReference(llm=judge),
        "context_recall": LLMContextRecall(llm=judge),
    }

    baseline = run_config("Baseline — Hybrid (no rerank)", False, eval_set, scorers)
    reranked = run_config("With Cross-Encoder Rerank", True, eval_set, scorers)
    # Per-metric change introduced by the reranker (reranked minus baseline).
    deltas = {
        m: round(reranked["aggregate"].get(m, 0.0) - baseline["aggregate"].get(m, 0.0), 4)
        for m in METRIC_ORDER
    }

    import ragas

    out = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "framework": f"ragas {ragas.__version__}",
            "judge_model": JUDGE_MODEL,
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "configs": {
            "Baseline (Hybrid, no rerank)": baseline["aggregate"],
            "With Cross-Encoder Rerank": reranked["aggregate"],
        },
        "deltas": deltas,
        "per_question": {
            "Baseline (Hybrid, no rerank)": baseline["per_question"],
            "With Cross-Encoder Rerank": reranked["per_question"],
        },
    }
    RESULTS_PATH.write_text(json.dumps(out, indent=2), encoding="utf-8")

    # Console summary table.
    print("\n" + "=" * 72)
    print(f"{'Metric':<22}{'Baseline':>12}{'Reranked':>12}{'Δ':>10}")
    print("-" * 72)
    for m in METRIC_ORDER:
        b = baseline["aggregate"].get(m, 0.0)
        r = reranked["aggregate"].get(m, 0.0)
        print(f"{m:<22}{b:>12.3f}{r:>12.3f}{deltas[m]:>+10.3f}")
    print("=" * 72)
    print(f"Saved → {RESULTS_PATH}")


if __name__ == "__main__":
    main()