Spaces:
Sleeping
Sleeping
| """ | |
| RAGAS evaluation of the retrieval pipeline. | |
| Uses the official RAGAS library (https://docs.ragas.io) to score the pipeline | |
| on four metrics — once with the cross-encoder reranker OFF (hybrid baseline) | |
| and once ON — over a curated question set with reference answers: | |
| • Faithfulness — answer claims supported by the retrieved context | |
| • Answer Relevancy — answer actually addresses the question | |
| • Context Precision — relevant chunks ranked near the top (with reference) | |
| • Context Recall — reference answer covered by the retrieved context | |
| Results are written to ``eval_results.json``, which the Gradio app renders in | |
| its "Evaluation" tab. | |
| The judge is Gemini 3.1 Flash Lite (via Google's OpenAI-compatible endpoint) | |
| wrapped for RAGAS; answer relevancy uses the project's EmbeddingGemma embeddings. | |
| Usage: | |
| pip install -r requirements.txt -r requirements-eval.txt | |
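| python evaluate.py # default: first 6 questions, both configs | |
| python evaluate.py --full # all 12 questions, both configs | |
| python evaluate.py --quick # first 4 questions only (smoke test) | |
| python evaluate.py --generate # retrieval + generation only; writes eval_samples.json | |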
| """ | |
| from __future__ import annotations | |
| import io | |
| import json | |
| import sys | |
| import time | |
| import types | |
| from datetime import datetime, timezone | |
# Force UTF-8 console output on Windows (Δ, →, etc.).
#
# Prefer reconfiguring the existing stream in place: settings that are not
# passed to ``reconfigure`` (notably line buffering) are kept, so progress lines
# still show up promptly during a long run. Re-wrapping the raw buffer is only
# the fallback for stdout replacements that do not offer ``reconfigure``; it
# asks for line buffering explicitly because a fresh wrapper defaults to none.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(
        sys.stdout.buffer,
        encoding="utf-8",
        errors="replace",
        line_buffering=True,
    )
# ---------------------------------------------------------------------------
# Compatibility shim for RAGAS 0.4.3.
#
# RAGAS hard-imports ``langchain_community.chat_models.vertexai`` (it is only
# used in an internal list of supported model types), but langchain-community
# 0.4.x removed that submodule. Registering a stub under that name *before*
# ragas is imported lets the import succeed while the app stays on the modern
# LangChain 1.x stack, with no downgrade. The stub is never exercised — the
# judge is ChatOpenAI.
# ---------------------------------------------------------------------------
_VERTEXAI_MODULE = "langchain_community.chat_models.vertexai"

if _VERTEXAI_MODULE not in sys.modules:
    _vx = types.ModuleType(_VERTEXAI_MODULE)
    # Placeholder class so ``from ... import ChatVertexAI`` resolves; it is
    # never instantiated.
    _vx.ChatVertexAI = type("ChatVertexAI", (), {})
    sys.modules[_VERTEXAI_MODULE] = _vx
| import numpy as np # noqa: E402 | |
| # NOTE: ragas / langchain_openai are imported lazily inside the judging path | |
| # only. The ``--generate`` phase (torch: embedder + reranker + Chroma) then runs | |
| # with exactly the app's import surface — importing the heavy ragas stack | |
| # alongside torch was triggering a native segfault on this Windows / Python 3.14 | |
| # box. Generation and judging are therefore split into two processes. | |
| import rag_chain # noqa: E402 | |
| from config import GOOGLE_API_KEY, RERANKER_MODEL, RETRIEVAL_FETCH_K # noqa: E402 | |
| # NOTE: RAGAS's batch ``evaluate()`` uses an async executor whose per-job | |
| # ``asyncio.timeout`` is incompatible with Python 3.14's asyncio. We instead | |
| # call each metric's synchronous ``single_turn_score`` in a plain loop — same | |
| # RAGAS metric implementations and prompts, just driven sequentially. | |
# Judge model: Gemini 3.1 Flash Lite, reached through Google's
# OpenAI-compatible endpoint (httpx) rather than the grpc client, which
# segfaults alongside torch on Python 3.14. Its 500 RPD / 250K TPM quota leaves
# headroom for a full 12-question A/B. (Groq free tiers proved too
# token-limited; Gemma's thinking mode breaks RAGAS JSON parsing.)
JUDGE_MODEL = "gemini-3.1-flash-lite"
GOOGLE_OPENAI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"

# Answers are generated with Google Gemma (the app's default model), which
# sits in a separate rate-limit bucket from the judge.
GEN_LLM_LABEL = "Gemma 4 MoE 26B [Google]"

# Final scores are written next to the vector-store directory.
RESULTS_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_results.json"

# Canonical metric keys, in the order they are aggregated and printed.
METRIC_ORDER = [
    "faithfulness",
    "answer_relevancy",
    "context_precision",
    "context_recall",
]
# ---------------------------------------------------------------------------
# Curated evaluation set — questions + reference (ground-truth) answers
# grounded in the 12 primary texts in the knowledge base.
#
# Each entry has exactly two string fields: ``question`` (sent to the RAG
# pipeline) and ``reference`` (the ground truth handed to the judge metrics).
# ---------------------------------------------------------------------------
EVAL_SET: list[dict[str, str]] = [
    {
        "question": "What does Nietzsche mean by the death of God?",
        "reference": (
            "Nietzsche uses the death of God to describe the collapse of "
            "belief in a transcendent source of meaning and morality. It is "
            "not a literal claim but a diagnosis of modern culture: with the "
            "divine foundation gone, inherited values lose their ground, "
            "risking nihilism, and humanity must create new values itself."
        ),
    },
    {
        "question": "How does Schopenhauer view suffering and the will to live?",
        "reference": (
            "Schopenhauer holds that the will to live is the blind, insatiable "
            "force underlying all existence. Because desire is endless and its "
            "satisfaction fleeting, life is dominated by suffering. Relief comes "
            "only through denial of the will, aesthetic contemplation, and "
            "compassion."
        ),
    },
    {
        "question": "What is Hume's argument about causation?",
        "reference": (
            "Hume argues that we never perceive a necessary connection between "
            "cause and effect, only the constant conjunction of events. Our "
            "belief in causation is a habit of the mind formed by repeated "
            "experience, not a truth derived from reason."
        ),
    },
    {
        "question": "Can we have certain knowledge of the external world according to Russell?",
        "reference": (
            "Russell distinguishes knowledge by acquaintance from knowledge by "
            "description. We are directly acquainted only with sense-data; the "
            "existence of physical objects is an inference. He argues the "
            "external world is the best hypothesis explaining our sense-data, "
            "though not known with absolute certainty."
        ),
    },
    {
        "question": "What is Kant's categorical imperative?",
        "reference": (
            "Kant's categorical imperative is an unconditional moral law: act "
            "only on a maxim you could will to become a universal law. It "
            "commands independently of desires or consequences, and requires "
            "treating humanity always as an end and never merely as a means."
        ),
    },
    {
        "question": "How does Mill justify the principle of utility?",
        "reference": (
            "Mill grounds morality in the greatest happiness principle: actions "
            "are right insofar as they promote happiness and wrong as they "
            "produce its reverse. He argues happiness is the sole thing desired "
            "as an end, and distinguishes higher (intellectual) from lower "
            "(bodily) pleasures by quality, not only quantity."
        ),
    },
    {
        "question": "What does Marcus Aurelius advise about things outside our control?",
        "reference": (
            "Marcus Aurelius, following Stoic doctrine, advises accepting what "
            "is outside our control as part of nature's order, and focusing only "
            "on our own judgments and actions. Externals cannot harm the rational "
            "self; disturbance comes from our opinions about events, not events "
            "themselves."
        ),
    },
    {
        "question": "What is Epictetus's distinction between what is in our power and what is not?",
        "reference": (
            "Epictetus opens the Enchiridion by dividing things into those in our "
            "power — our opinions, desires, and aversions — and those not — body, "
            "property, reputation. Tranquility comes from caring only about what "
            "is in our power and treating the rest with indifference."
        ),
    },
    {
        "question": "What is Plato's ideal society in The Republic?",
        "reference": (
            "Plato's ideal state is a just city ordered into three classes — "
            "rulers (philosopher-kings), guardians, and producers — mirroring the "
            "soul's reason, spirit, and appetite. Justice is each part performing "
            "its proper role. The guardian class shares property and family, and "
            "rulers are chosen for wisdom and educated to know the Good."
        ),
    },
    {
        "question": "What is the will to power in Nietzsche's thought?",
        "reference": (
            "For Nietzsche the will to power is the fundamental drive of life: "
            "not mere survival but the striving to grow, overcome, and impose "
            "form. It underlies values and actions, and the higher type affirms "
            "life by creating values out of this creative, self-overcoming force."
        ),
    },
    {
        "question": "How does Nietzsche characterize master and slave morality?",
        "reference": (
            "In the Genealogy of Morality, Nietzsche contrasts master morality, "
            "which originates in the strong and calls 'good' what is noble and "
            "powerful, with slave morality, which arises from the resentment of "
            "the weak and revalues humility, meekness, and pity as good while "
            "branding the strong as evil."
        ),
    },
    {
        "question": "What role does eternal recurrence play in Nietzsche's philosophy?",
        "reference": (
            "Eternal recurrence is the thought that one's life will repeat "
            "identically and infinitely. Nietzsche poses it as a test of life-"
            "affirmation: to will the eternal return of every moment, including "
            "suffering, is the highest expression of amor fati and saying yes to "
            "existence."
        ),
    },
]
| # --------------------------------------------------------------------------- | |
| # RAGAS plumbing | |
| # --------------------------------------------------------------------------- | |
def _build_judge():
    """Build the judge LLM and wrap it for use by the RAGAS metrics.

    The judge is a LangChain ``ChatOpenAI`` client pointed at
    ``GOOGLE_OPENAI_BASE`` and authenticated with ``GOOGLE_API_KEY``.

    Returns:
        A ``LangchainLLMWrapper`` around the configured chat model.
    """
    # Imported here rather than at module level so that only the judging path
    # loads langchain_openai / ragas.
    from langchain_openai import ChatOpenAI
    from ragas.llms import LangchainLLMWrapper

    judge_settings = {
        "model": JUDGE_MODEL,
        "api_key": GOOGLE_API_KEY,
        "base_url": GOOGLE_OPENAI_BASE,
        "temperature": 0.0,  # deterministic scoring
        "max_tokens": 3000,
        "timeout": 90,
        "max_retries": 4,  # absorb the occasional 15-RPM 429
    }
    chat_model = ChatOpenAI(**judge_settings)
    return LangchainLLMWrapper(chat_model)
| # Gemini 3.1 Flash Lite allows 15 RPM. A short pace between metrics keeps | |
| # multi-call metrics under the per-minute request limit. | |
| PACE_SECONDS = 5 | |
| def _score_sample(sample: SingleTurnSample, scorers: dict) -> dict[str, float | None]: | |
| """Score one sample on all four metrics; isolate failures per-metric.""" | |
| out: dict[str, float | None] = {} | |
| for i, (canon, metric) in enumerate(scorers.items()): | |
| if i: | |
| time.sleep(PACE_SECONDS) # respect the 6000 TPM bucket | |
| try: | |
| v = float(metric.single_turn_score(sample)) | |
| out[canon] = None if v != v else round(v, 3) # NaN β None | |
| except Exception as exc: | |
| print(f" {canon} failed: {str(exc)[:70]}") | |
| out[canon] = None | |
| return out | |
# Answers are generated with Gemma through Google's OpenAI-compatible endpoint
# (httpx). The native google.genai client uses grpc, which segfaults alongside
# torch on this Python 3.14 box, so generation stays on the same httpx path as
# the judge.
GEN_MODEL_ID = "gemma-4-26b-a4b-it"


def _generate(question: str) -> dict:
    """Retrieve passages for *question* and generate an answer from them.

    Args:
        question: The user question to answer.

    Returns:
        A dict with ``"answer"`` (the content of the first completion choice)
        and ``"context"`` (the retrieved documents, as returned by
        ``rag_chain.retrieve_docs``).
    """
    # Retrieval first (torch, no grpc); the second return value is unused here.
    docs, _ = rag_chain.retrieve_docs(question, "All")
    context_str = "\n\n".join(doc.page_content for doc in docs)

    from openai import OpenAI

    # Explicit timeout / retry budget so a stalled request cannot hang the run.
    client = OpenAI(
        api_key=GOOGLE_API_KEY,
        base_url=GOOGLE_OPENAI_BASE,
        timeout=90,
        max_retries=2,
    )
    user_message = (
        f"Relevant passages from your knowledge base:\n{context_str}\n\n"
        f"Question: {question}"
    )
    chat_messages = [
        {"role": "system", "content": rag_chain.SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=GEN_MODEL_ID,
        messages=chat_messages,
        temperature=0.3,
        timeout=90,
    )
    answer = completion.choices[0].message.content
    return {"answer": answer, "context": docs}
def _generate_with_retry(question: str, retries: int = 5) -> dict:
    """Run ``_generate`` with linear backoff between failed attempts.

    The backoff is meant for Google RPM (429) limits, but *every* exception is
    retried — the error type is not inspected.

    Args:
        question: The question forwarded to ``_generate``.
        retries: Total number of attempts (not extra attempts); must be >= 1.

    Returns:
        The dict returned by the first successful ``_generate`` call.

    Raises:
        ValueError: If *retries* is smaller than 1. Previously the loop simply
            did not run and the function returned ``None``.
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    for attempt in range(1, retries + 1):
        try:
            return _generate(question)
        except Exception as exc:
            if attempt == retries:
                raise  # out of attempts: surface the last error to the caller
            wait = 8 * attempt  # 8s, 16s, 24s, ...
            print(f" generation retry in {wait}s ({str(exc)[:50]})")
            time.sleep(wait)

    # Every iteration either returns or raises, so this is never reached.
    raise AssertionError("unreachable")
def _fmt_score(value: float | None) -> str:
    """Render one metric for the progress line; a failed metric shows a dash."""
    return " — " if value is None else f"{value:.2f}"


def run_config(name: str, use_reranker: bool, eval_set: list[dict], scorers: dict) -> dict:
    """Generate and judge every question in *eval_set* under one configuration.

    Args:
        name: Human-readable configuration label, used only in console output.
        use_reranker: Value assigned to ``rag_chain.USE_RERANKER`` for this run.
        eval_set: Items with ``"question"`` and ``"reference"`` keys.
        scorers: Mapping of metric name to RAGAS metric, as consumed by
            ``_score_sample``. It must contain every name in ``METRIC_ORDER``.

    Returns:
        ``{"aggregate": {metric: mean}, "per_question": [...]}``. Means are
        taken over the questions whose score is not ``None`` and rounded to
        4 decimals.
    """
    from ragas import SingleTurnSample

    rag_chain.USE_RERANKER = use_reranker  # runtime toggle (see retrieve_docs)
    print(f"\n=== {name} (reranker={'ON' if use_reranker else 'OFF'}) ===")

    total = len(eval_set)
    per_question = []
    for i, item in enumerate(eval_set, 1):
        t0 = time.perf_counter()
        result = _generate_with_retry(item["question"])
        sample = SingleTurnSample(
            user_input=item["question"],
            response=result["answer"],
            retrieved_contexts=[d.page_content for d in result["context"]],
            reference=item["reference"],
        )
        scores = _score_sample(sample, scorers)
        per_question.append({"question": item["question"], **scores})
        # Elapsed time covers generation plus judging for this question.
        print(f" [{i}/{total}] {time.perf_counter() - t0:5.1f}s "
              f"F={_fmt_score(scores['faithfulness'])} "
              f"AR={_fmt_score(scores['answer_relevancy'])} "
              f"CP={_fmt_score(scores['context_precision'])} "
              f"CR={_fmt_score(scores['context_recall'])} "
              f"{item['question'][:46]}")

    agg = {}
    for m in METRIC_ORDER:
        vals = [r[m] for r in per_question if r[m] is not None]
        # NOTE(review): a metric that failed on every question is reported as
        # 0.0, indistinguishable from a genuine zero. Kept because callers do
        # arithmetic on and float-format these values.
        agg[m] = round(float(np.mean(vals)), 4) if vals else 0.0
    return {"aggregate": agg, "per_question": per_question}
# Generated samples are stored next to the vector-store directory.
SAMPLES_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_samples.json"

# Incremental A/B/C ablation; each entry is
# (name, use_reranker, use_query_rewrite).
CONFIGS = [
    ("Baseline (Hybrid)", False, False),
    ("+ Reranker", True, False),
    ("+ Query Rewrite", True, True),
]


def generate_samples(eval_set: list[dict]) -> dict:
    """Phase A: run retrieval + generation for every question under each config.

    No judging happens here; scoring the samples is a separate phase. The
    ``rag_chain`` feature flags are set per config and are left at the last
    config's values when the function returns.

    Args:
        eval_set: Items with ``"question"`` and ``"reference"`` keys.

    Returns:
        Mapping of config name to a list of rows, each holding ``question``,
        ``reference``, ``answer`` and ``contexts`` (retrieved page contents).
    """
    rag_chain.USE_CORRECTIVE_RAG = False  # never abstain during evaluation

    total = len(eval_set)
    samples_by_config: dict[str, list[dict]] = {}
    for label, rerank_on, rewrite_on in CONFIGS:
        rag_chain.USE_RERANKER = rerank_on
        rag_chain.USE_QUERY_REWRITE = rewrite_on
        print(f"\n=== Generating: {label} (rerank={rerank_on}, rewrite={rewrite_on}) ===")

        records = []
        for position, entry in enumerate(eval_set, start=1):
            generated = _generate_with_retry(entry["question"])
            retrieved_texts = [doc.page_content for doc in generated["context"]]
            records.append({
                "question": entry["question"],
                "reference": entry["reference"],
                "answer": generated["answer"],
                "contexts": retrieved_texts,
            })
            print(f" [{position}/{total}] {entry['question'][:55]}")
        samples_by_config[label] = records
    return samples_by_config
def _run_generation(quick: bool) -> None:
    """Run retrieval + generation for every config and save the samples."""
    eval_set = EVAL_SET[:4] if quick else EVAL_SET
    samples = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "samples": generate_samples(eval_set),
    }
    SAMPLES_PATH.write_text(json.dumps(samples, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\nSaved {len(eval_set)} questions x {len(CONFIGS)} configs → {SAMPLES_PATH}")


def _run_judging(quick: bool, full: bool) -> None:
    """Score the baseline and reranked configs with RAGAS and save the results."""
    # Default to 6 questions: RAGAS is token-heavy, so a partial run keeps the
    # default invocation completable; ``--full`` scores all questions × 2
    # configs and ``--quick`` only the first 4.
    eval_set = EVAL_SET[:4] if quick else (EVAL_SET if full else EVAL_SET[:6])

    # Imported lazily so the ``--generate`` path never loads the ragas stack.
    from ragas.embeddings import LangchainEmbeddingsWrapper
    from ragas.metrics import (
        Faithfulness, LLMContextPrecisionWithReference,
        LLMContextRecall, ResponseRelevancy,
    )

    judge = _build_judge()
    embeddings = LangchainEmbeddingsWrapper(rag_chain._get_embeddings())
    scorers = {
        "faithfulness": Faithfulness(llm=judge),
        # strictness=1 → asks the LLM for a single completion (n=1).
        # NOTE(review): introduced because Groq only allowed n=1 — confirm it is
        # still wanted now that the judge runs on the Gemini endpoint.
        "answer_relevancy": ResponseRelevancy(llm=judge, embeddings=embeddings, strictness=1),
        "context_precision": LLMContextPrecisionWithReference(llm=judge),
        "context_recall": LLMContextRecall(llm=judge),
    }

    baseline = run_config("Baseline — Hybrid (no rerank)", False, eval_set, scorers)
    reranked = run_config("With Cross-Encoder Rerank", True, eval_set, scorers)
    deltas = {
        m: round(reranked["aggregate"].get(m, 0.0) - baseline["aggregate"].get(m, 0.0), 4)
        for m in METRIC_ORDER
    }

    import ragas

    out = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "framework": f"ragas {ragas.__version__}",
            "judge_model": JUDGE_MODEL,
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "configs": {
            "Baseline (Hybrid, no rerank)": baseline["aggregate"],
            "With Cross-Encoder Rerank": reranked["aggregate"],
        },
        "deltas": deltas,
        "per_question": {
            "Baseline (Hybrid, no rerank)": baseline["per_question"],
            "With Cross-Encoder Rerank": reranked["per_question"],
        },
    }
    # ensure_ascii=False matches how the samples file is written.
    RESULTS_PATH.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")

    print("\n" + "=" * 72)
    print(f"{'Metric':<22}{'Baseline':>12}{'Reranked':>12}{'Δ':>10}")
    print("-" * 72)
    for m in METRIC_ORDER:
        b = baseline["aggregate"].get(m, 0.0)
        r = reranked["aggregate"].get(m, 0.0)
        print(f"{m:<22}{b:>12.3f}{r:>12.3f}{deltas[m]:>+10.3f}")
    print("=" * 72)
    print(f"Saved → {RESULTS_PATH}")


def main() -> None:
    """Command-line entry point.

    Flags (read directly from ``sys.argv``):
        --generate  Only run retrieval + generation and write the samples file.
        --quick     Use the first 4 questions.
        --full      Judge all questions instead of the default first 6
                    (``--generate`` already uses all questions unless
                    ``--quick`` is given).

    Raises:
        SystemExit: If ``GOOGLE_API_KEY`` is not set.
    """
    quick = "--quick" in sys.argv
    full = "--full" in sys.argv

    # Both phases talk to Google with this key (generation as well as judging),
    # so fail fast instead of burning the retry backoff on every question.
    if not GOOGLE_API_KEY:
        raise SystemExit(
            "GOOGLE_API_KEY not set — needed for answer generation and the judge model."
        )

    if "--generate" in sys.argv:
        _run_generation(quick)
        return
    _run_judging(quick, full)


if __name__ == "__main__":
    main()