Spaces:
Sleeping
Sleeping
File size: 19,001 Bytes
276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 a7eba21 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 c2466c8 276d415 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | """
RAGAS evaluation of the retrieval pipeline.
Uses the official RAGAS library (https://docs.ragas.io) to score the pipeline
on four metrics — once with the cross-encoder reranker OFF (hybrid baseline)
and once ON — over a curated question set with reference answers:
• Faithfulness — answer claims supported by the retrieved context
• Answer Relevancy — answer actually addresses the question
• Context Precision — relevant chunks ranked near the top (with reference)
• Context Recall — reference answer covered by the retrieved context
Results are written to ``eval_results.json``, which the Gradio app renders in
its "Evaluation" tab.
The judge is Gemini 3.1 Flash Lite (reached through Google's OpenAI-compatible
endpoint) wrapped for RAGAS; answer relevancy uses the project's
EmbeddingGemma embeddings.
Usage:
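    pip install -r requirements.txt -r requirements-eval.txt
    python evaluate.py             # default run: first 6 questions, both configs
    python evaluate.py --full      # all questions, both configs
    python evaluate.py --quick     # first 4 questions only (smoke test)
    python evaluate.py --generate  # retrieval + generation only; writes eval_samples.json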
"""
from __future__ import annotations
import io
import json
import sys
import time
import types
from datetime import datetime, timezone
# Force UTF-8 console output (the summary table prints Δ, →, etc.), which a
# non-UTF-8 Windows console encoding cannot represent. ``reconfigure`` changes
# the encoding of the existing stream in place, so its buffering mode is kept
# and progress lines still show up as they are printed; wrapping
# ``sys.stdout.buffer`` in a brand-new ``TextIOWrapper`` defaults to
# ``line_buffering=False`` and holds output back during long runs.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    # Stream objects without ``reconfigure``: re-wrap the byte buffer, but keep
    # per-line flushing so progress output is not delayed.
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer, encoding="utf-8", errors="replace", line_buffering=True
    )
# ---------------------------------------------------------------------------
# Compatibility shim. RAGAS 0.4.3 unconditionally imports
# ``langchain_community.chat_models.vertexai`` (the class is only used in an
# internal list of supported model types), but langchain-community 0.4.x no
# longer ships that submodule. This project stays on the modern LangChain 1.x
# stack, so a placeholder module is registered *before* ragas gets imported.
# The placeholder is never exercised: the judge is a ChatOpenAI client, so the
# full langchain 1.x app stack stays intact with no downgrade.
# ---------------------------------------------------------------------------
if "langchain_community.chat_models.vertexai" not in sys.modules:
    _vx = types.ModuleType("langchain_community.chat_models.vertexai")
    # Empty stand-in class; it only has to exist, it is never instantiated.
    setattr(_vx, "ChatVertexAI", type("ChatVertexAI", (), {}))
    sys.modules[_vx.__name__] = _vx
import numpy as np # noqa: E402
# NOTE: ragas / langchain_openai are imported lazily inside the judging path
# only. The ``--generate`` phase (torch: embedder + reranker + Chroma) then runs
# with exactly the app's import surface — importing the heavy ragas stack
# alongside torch was triggering a native segfault on this Windows / Python 3.14
# box. Generation and judging are therefore split into two processes.
import rag_chain # noqa: E402
from config import GOOGLE_API_KEY, RERANKER_MODEL, RETRIEVAL_FETCH_K # noqa: E402
# NOTE: RAGAS's batch ``evaluate()`` uses an async executor whose per-job
# ``asyncio.timeout`` is incompatible with Python 3.14's asyncio. We instead
# call each metric's synchronous ``single_turn_score`` in a plain loop — same
# RAGAS metric implementations and prompts, just driven sequentially.
# Judge — Gemini 3.1 Flash Lite via Google's OpenAI-compatible endpoint (httpx,
# NOT the grpc client, which segfaults alongside torch on Python 3.14). 500 RPD
# / 250K TPM gives headroom for a full 12-question A/B. (Groq free tiers proved
# too token-limited; Gemma's thinking mode breaks RAGAS JSON parsing.)
JUDGE_MODEL = "gemini-3.1-flash-lite"
GOOGLE_OPENAI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"
# Answer generation runs on Google Gemma (the app's default model) — a separate
# rate-limit bucket from the judge.
GEN_LLM_LABEL = "Gemma 4 MoE 26B [Google]"
RESULTS_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_results.json"
METRIC_ORDER = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]
# ---------------------------------------------------------------------------
# Curated evaluation set — questions + reference (ground-truth) answers
# grounded in the 12 primary texts in the knowledge base.
# ---------------------------------------------------------------------------
EVAL_SET: list[dict] = [
{
"question": "What does Nietzsche mean by the death of God?",
"reference": (
"Nietzsche uses the death of God to describe the collapse of "
"belief in a transcendent source of meaning and morality. It is "
"not a literal claim but a diagnosis of modern culture: with the "
"divine foundation gone, inherited values lose their ground, "
"risking nihilism, and humanity must create new values itself."
),
},
{
"question": "How does Schopenhauer view suffering and the will to live?",
"reference": (
"Schopenhauer holds that the will to live is the blind, insatiable "
"force underlying all existence. Because desire is endless and its "
"satisfaction fleeting, life is dominated by suffering. Relief comes "
"only through denial of the will, aesthetic contemplation, and "
"compassion."
),
},
{
"question": "What is Hume's argument about causation?",
"reference": (
"Hume argues that we never perceive a necessary connection between "
"cause and effect, only the constant conjunction of events. Our "
"belief in causation is a habit of the mind formed by repeated "
"experience, not a truth derived from reason."
),
},
{
"question": "Can we have certain knowledge of the external world according to Russell?",
"reference": (
"Russell distinguishes knowledge by acquaintance from knowledge by "
"description. We are directly acquainted only with sense-data; the "
"existence of physical objects is an inference. He argues the "
"external world is the best hypothesis explaining our sense-data, "
"though not known with absolute certainty."
),
},
{
"question": "What is Kant's categorical imperative?",
"reference": (
"Kant's categorical imperative is an unconditional moral law: act "
"only on a maxim you could will to become a universal law. It "
"commands independently of desires or consequences, and requires "
"treating humanity always as an end and never merely as a means."
),
},
{
"question": "How does Mill justify the principle of utility?",
"reference": (
"Mill grounds morality in the greatest happiness principle: actions "
"are right insofar as they promote happiness and wrong as they "
"produce its reverse. He argues happiness is the sole thing desired "
"as an end, and distinguishes higher (intellectual) from lower "
"(bodily) pleasures by quality, not only quantity."
),
},
{
"question": "What does Marcus Aurelius advise about things outside our control?",
"reference": (
"Marcus Aurelius, following Stoic doctrine, advises accepting what "
"is outside our control as part of nature's order, and focusing only "
"on our own judgments and actions. Externals cannot harm the rational "
"self; disturbance comes from our opinions about events, not events "
"themselves."
),
},
{
"question": "What is Epictetus's distinction between what is in our power and what is not?",
"reference": (
"Epictetus opens the Enchiridion by dividing things into those in our "
"power β our opinions, desires, and aversions β and those not β body, "
"property, reputation. Tranquility comes from caring only about what "
"is in our power and treating the rest with indifference."
),
},
{
"question": "What is Plato's ideal society in The Republic?",
"reference": (
"Plato's ideal state is a just city ordered into three classes β "
"rulers (philosopher-kings), guardians, and producers β mirroring the "
"soul's reason, spirit, and appetite. Justice is each part performing "
"its proper role. The guardian class shares property and family, and "
"rulers are chosen for wisdom and educated to know the Good."
),
},
{
"question": "What is the will to power in Nietzsche's thought?",
"reference": (
"For Nietzsche the will to power is the fundamental drive of life: "
"not mere survival but the striving to grow, overcome, and impose "
"form. It underlies values and actions, and the higher type affirms "
"life by creating values out of this creative, self-overcoming force."
),
},
{
"question": "How does Nietzsche characterize master and slave morality?",
"reference": (
"In the Genealogy of Morality, Nietzsche contrasts master morality, "
"which originates in the strong and calls 'good' what is noble and "
"powerful, with slave morality, which arises from the resentment of "
"the weak and revalues humility, meekness, and pity as good while "
"branding the strong as evil."
),
},
{
"question": "What role does eternal recurrence play in Nietzsche's philosophy?",
"reference": (
"Eternal recurrence is the thought that one's life will repeat "
"identically and infinitely. Nietzsche poses it as a test of life-"
"affirmation: to will the eternal return of every moment, including "
"suffering, is the highest expression of amor fati and saying yes to "
"existence."
),
},
]
# ---------------------------------------------------------------------------
# RAGAS plumbing
# ---------------------------------------------------------------------------
def _build_judge():
    """Create the judge LLM used by the RAGAS metrics.

    Builds a ``ChatOpenAI`` client for ``JUDGE_MODEL`` pointed at
    ``GOOGLE_OPENAI_BASE`` and returns it wrapped in ``LangchainLLMWrapper``.
    Both third-party imports are local, so they are only loaded when a judge
    is actually built.
    """
    from langchain_openai import ChatOpenAI
    from ragas.llms import LangchainLLMWrapper

    judge_settings = {
        "model": JUDGE_MODEL,
        "api_key": GOOGLE_API_KEY,
        "base_url": GOOGLE_OPENAI_BASE,
        "temperature": 0.0,
        "max_tokens": 3000,
        "timeout": 90,
        "max_retries": 4,  # absorb the occasional 15-RPM 429
    }
    chat_model = ChatOpenAI(**judge_settings)
    return LangchainLLMWrapper(chat_model)
# Gemini 3.1 Flash Lite allows 15 RPM. A short pace between metrics keeps
# multi-call metrics under the per-minute request limit.
PACE_SECONDS = 5
def _score_sample(sample: SingleTurnSample, scorers: dict) -> dict[str, float | None]:
    """Score one sample with every metric in ``scorers``, isolating failures.

    Args:
        sample: The sample passed unchanged to each metric's
            ``single_turn_score``.
        scorers: Mapping of canonical metric name to metric object. Metrics
            run in the mapping's iteration order.

    Returns:
        One entry per scorer: the score rounded to three decimals, or ``None``
        when the metric raised or produced NaN.
    """
    import math  # local import: the module's top-level imports do not include math

    out: dict[str, float | None] = {}
    for i, (canon, metric) in enumerate(scorers.items()):
        if i:
            # Pause between metrics (not before the first one) to stay under
            # the judge's per-minute request limit.
            time.sleep(PACE_SECONDS)
        try:
            v = float(metric.single_turn_score(sample))
        except Exception as exc:
            # Deliberate best-effort: one failing metric must not abort the
            # remaining metrics or the rest of the run.
            print(f" {canon} failed: {str(exc)[:70]}")
            out[canon] = None
        else:
            out[canon] = None if math.isnan(v) else round(v, 3)  # NaN -> None
    return out
# Generate with Gemma via Google's OpenAI-compatible endpoint (httpx). The
# native google.genai client uses grpc, which segfaults alongside torch on this
# Python 3.14 box — so we keep generation on the same httpx path as the judge.
GEN_MODEL_ID = "gemma-4-26b-a4b-it"
def _generate(question: str) -> dict:
    """Retrieve passages for ``question`` and generate an answer from them.

    Returns:
        A dict with ``"answer"`` (the message content of the first completion
        choice) and ``"context"`` (the documents returned by
        ``rag_chain.retrieve_docs``).
    """
    retrieved, _ = rag_chain.retrieve_docs(question, "All")  # torch retrieval (no grpc)
    passages = "\n\n".join(doc.page_content for doc in retrieved)

    # Imported only after retrieval has run, mirroring the original order.
    from openai import OpenAI

    api = OpenAI(
        api_key=GOOGLE_API_KEY,
        base_url=GOOGLE_OPENAI_BASE,
        timeout=90,  # avoid indefinite hangs
        max_retries=2,
    )
    prompt = (
        "Relevant passages from your knowledge base:\n"
        f"{passages}\n\n"
        f"Question: {question}"
    )
    conversation = [
        {"role": "system", "content": rag_chain.SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = api.chat.completions.create(
        model=GEN_MODEL_ID,
        messages=conversation,
        temperature=0.3,
        timeout=90,
    )
    return {"answer": completion.choices[0].message.content, "context": retrieved}
def _generate_with_retry(question: str, retries: int = 5) -> dict:
    """Run ``_generate`` with linearly growing backoff between failed attempts.

    Intended to ride out Google RPM (429) limits, although every exception
    type is retried.

    Args:
        question: Question forwarded to ``_generate``.
        retries: Total number of attempts. Values below 1 are treated as 1, so
            the function always makes an attempt instead of falling through
            and returning ``None``.

    Returns:
        The dict returned by the first successful ``_generate`` call.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            return _generate(question)
        except Exception as exc:
            if attempt == attempts:
                raise
            wait = 8 * attempt  # 8s, 16s, 24s, ...
            print(f" generation retry in {wait}s ({str(exc)[:50]})")
            time.sleep(wait)
    # Not reachable: the last loop iteration either returns or re-raises.
    raise AssertionError("retry loop ended without a result")
def run_config(name: str, use_reranker: bool, eval_set: list[dict], scorers: dict) -> dict:
    """Generate, judge and aggregate one pipeline configuration.

    Sets ``rag_chain.USE_RERANKER`` to ``use_reranker``. For every item in
    ``eval_set`` it then generates an answer, scores it with ``scorers`` and
    prints one progress line.

    Args:
        name: Label printed in the section header.
        use_reranker: Value assigned to ``rag_chain.USE_RERANKER``.
        eval_set: Items with ``"question"`` and ``"reference"`` keys.
        scorers: Metric-name -> metric mapping handed to ``_score_sample``.

    Returns:
        ``{"aggregate": ..., "per_question": ...}``. The aggregate holds, for
        each metric in ``METRIC_ORDER``, the mean over the questions that
        produced a score, rounded to four decimals (``0.0`` when none did).
    """
    from ragas import SingleTurnSample

    def fmt(value: float | None) -> str:
        # A missing score (metric failed or returned NaN) is shown as a dash.
        return f"{value:.2f}" if value is not None else " — "

    rag_chain.USE_RERANKER = use_reranker  # runtime toggle (see retrieve_docs)
    print(f"\n=== {name} (reranker={'ON' if use_reranker else 'OFF'}) ===")

    per_question = []
    for i, item in enumerate(eval_set, 1):
        t0 = time.perf_counter()
        result = _generate_with_retry(item["question"])
        sample = SingleTurnSample(
            user_input=item["question"],
            response=result["answer"],
            retrieved_contexts=[d.page_content for d in result["context"]],
            reference=item["reference"],
        )
        scores = _score_sample(sample, scorers)
        per_question.append({"question": item["question"], **scores})
        print(
            f" [{i}/{len(eval_set)}] {time.perf_counter() - t0:5.1f}s "
            f"F={fmt(scores['faithfulness'])} AR={fmt(scores['answer_relevancy'])} "
            f"CP={fmt(scores['context_precision'])} CR={fmt(scores['context_recall'])} "
            f"{item['question'][:46]}"
        )

    agg = {}
    for m in METRIC_ORDER:
        # Failed judgments are stored as None and excluded from the mean.
        vals = [r[m] for r in per_question if r[m] is not None]
        agg[m] = round(float(np.mean(vals)), 4) if vals else 0.0
    return {"aggregate": agg, "per_question": per_question}
SAMPLES_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_samples.json"
# (name, use_reranker, use_query_rewrite) — an incremental A/B/C ablation.
CONFIGS = [
("Baseline (Hybrid)", False, False),
("+ Reranker", True, False),
("+ Query Rewrite", True, True),
]
def generate_samples(eval_set: list[dict]) -> dict:
    """Phase A: collect answers and contexts for every question under each config.

    Runs retrieval + generation (via ``_generate_with_retry``) for each entry
    of ``CONFIGS`` and returns the raw samples keyed by config name. Nothing is
    judged here; scoring is a separate phase.

    Side effects: sets ``rag_chain.USE_CORRECTIVE_RAG`` to ``False`` and
    overwrites ``rag_chain.USE_RERANKER`` / ``rag_chain.USE_QUERY_REWRITE`` for
    each config; the previous values are not restored.
    """
    rag_chain.USE_CORRECTIVE_RAG = False  # never abstain during evaluation
    samples_by_config: dict[str, list[dict]] = {}
    for label, rerank_on, rewrite_on in CONFIGS:
        rag_chain.USE_RERANKER = rerank_on
        rag_chain.USE_QUERY_REWRITE = rewrite_on
        print(f"\n=== Generating: {label} (rerank={rerank_on}, rewrite={rewrite_on}) ===")

        collected = []
        for position, entry in enumerate(eval_set, start=1):
            question = entry["question"]
            generated = _generate_with_retry(question)
            record = {
                "question": question,
                "reference": entry["reference"],
                "answer": generated["answer"],
                "contexts": [doc.page_content for doc in generated["context"]],
            }
            collected.append(record)
            print(f" [{position}/{len(eval_set)}] {question[:55]}")
        samples_by_config[label] = collected
    return samples_by_config
def main() -> None:
    """Command-line entry point.

    Flags (read from ``sys.argv``):
        --generate  Run retrieval + generation only and write ``SAMPLES_PATH``.
        --quick     Use only the first 4 questions.
        --full      Judge all of ``EVAL_SET`` (a plain judging run uses the
                    first 6).

    Without ``--generate`` the function scores a baseline run (reranker off)
    and a reranked run on four RAGAS metrics, writes ``RESULTS_PATH`` and
    prints a comparison table.

    Raises:
        SystemExit: If ``GOOGLE_API_KEY`` is not set.
    """
    quick = "--quick" in sys.argv
    full = "--full" in sys.argv

    # Both phases send requests authenticated with this key (``_generate`` in
    # either phase, the judge when scoring). Check it before any work starts
    # rather than failing inside the generation retry loop.
    if not GOOGLE_API_KEY:
        raise SystemExit(
            "GOOGLE_API_KEY not set — needed for answer generation and the judge model."
        )

    if "--generate" in sys.argv:
        eval_set = EVAL_SET[:4] if quick else EVAL_SET
        samples = {
            "metadata": {
                "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "gen_model": GEN_LLM_LABEL,
                "reranker_model": RERANKER_MODEL,
                "fetch_k": RETRIEVAL_FETCH_K,
                "n_questions": len(eval_set),
            },
            "samples": generate_samples(eval_set),
        }
        SAMPLES_PATH.write_text(
            json.dumps(samples, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        print(f"\nSaved {len(eval_set)} questions x {len(CONFIGS)} configs → {SAMPLES_PATH}")
        return

    # Default to 6 questions: RAGAS is token-heavy, so a full 12-question A/B
    # (~24 samples) is a long run; six questions × 2 configs is a
    # representative, completable one.
    # NOTE(review): the original ~70 min / ~30 min estimates were written for
    # a Groq free-tier judge (6000 TPM); re-measure for the current judge.
    eval_set = EVAL_SET[:4] if quick else (EVAL_SET if full else EVAL_SET[:6])

    from ragas.embeddings import LangchainEmbeddingsWrapper
    from ragas.metrics import (
        Faithfulness, LLMContextPrecisionWithReference,
        LLMContextRecall, ResponseRelevancy,
    )

    judge = _build_judge()
    embeddings = LangchainEmbeddingsWrapper(rag_chain._get_embeddings())
    scorers = {
        "faithfulness": Faithfulness(llm=judge),
        # strictness=1 → asks the LLM for n=1 completion (introduced because
        # Groq only allowed n=1).
        "answer_relevancy": ResponseRelevancy(llm=judge, embeddings=embeddings, strictness=1),
        "context_precision": LLMContextPrecisionWithReference(llm=judge),
        "context_recall": LLMContextRecall(llm=judge),
    }

    baseline = run_config("Baseline — Hybrid (no rerank)", False, eval_set, scorers)
    reranked = run_config("With Cross-Encoder Rerank", True, eval_set, scorers)

    # Per-metric improvement of the reranked run over the baseline.
    deltas = {
        m: round(reranked["aggregate"].get(m, 0.0) - baseline["aggregate"].get(m, 0.0), 4)
        for m in METRIC_ORDER
    }

    import ragas  # only needed here, for the version recorded in the metadata

    out = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "framework": f"ragas {ragas.__version__}",
            "judge_model": JUDGE_MODEL,
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "configs": {
            "Baseline (Hybrid, no rerank)": baseline["aggregate"],
            "With Cross-Encoder Rerank": reranked["aggregate"],
        },
        "deltas": deltas,
        "per_question": {
            "Baseline (Hybrid, no rerank)": baseline["per_question"],
            "With Cross-Encoder Rerank": reranked["per_question"],
        },
    }
    RESULTS_PATH.write_text(json.dumps(out, indent=2), encoding="utf-8")

    print("\n" + "=" * 72)
    print(f"{'Metric':<22}{'Baseline':>12}{'Reranked':>12}{'Δ':>10}")
    print("-" * 72)
    for m in METRIC_ORDER:
        b = baseline["aggregate"].get(m, 0.0)
        r = reranked["aggregate"].get(m, 0.0)
        print(f"{m:<22}{b:>12.3f}{r:>12.3f}{deltas[m]:>+10.3f}")
    print("=" * 72)
    print(f"Saved → {RESULTS_PATH}")
if __name__ == "__main__":
main()
|