Spaces:

fikri0o0
/

philosopher-chat

Sleeping

File size: 19,001 Bytes

"""
RAGAS evaluation of the retrieval pipeline.

Uses the official RAGAS library (https://docs.ragas.io) to score the pipeline
on four metrics — once with the cross-encoder reranker OFF (hybrid baseline)
and once ON — over a curated question set with reference answers:

    • Faithfulness        — answer claims supported by the retrieved context
    • Answer Relevancy    — answer actually addresses the question
    • Context Precision   — relevant chunks ranked near the top (with reference)
    • Context Recall      — reference answer covered by the retrieved context

Results are written to ``eval_results.json``, which the Gradio app renders in
its "📊 Evaluation" tab.

The judge is Gemini 3.1 Flash Lite (Google's OpenAI-compatible endpoint)
wrapped for RAGAS; answer relevancy uses the project's EmbeddingGemma embeddings.

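Usage:
    pip install -r requirements.txt -r requirements-eval.txt
    python evaluate.py             # judged run, first 6 questions, both configs
    python evaluate.py --full      # judged run, all 12 questions
    python evaluate.py --quick     # first 4 questions only (smoke test)
    python evaluate.py --generate  # only dump RAG samples to eval_samples.json (no judging)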
"""

from __future__ import annotations

import io
import json
import math
import sys
import time
import types
from datetime import datetime, timezone

# Force UTF-8 console output on Windows (Δ, etc.). ``reconfigure`` changes the
# encoding of the existing stream in place, so its buffering mode is kept and
# per-question progress lines show up as they are printed. Wrapping
# ``sys.stdout.buffer`` in a fresh ``TextIOWrapper`` defaults to
# ``line_buffering=False`` and would hold output back until the buffer fills.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

# ---------------------------------------------------------------------------
# Compatibility shim for RAGAS 0.4.3.
#
# RAGAS hard-imports ``langchain_community.chat_models.vertexai`` (it only
# appears in an internal list of supported model types), but langchain-community
# 0.4.x removed that submodule. Rather than downgrade the LangChain 1.x stack,
# register an empty stand-in module *before* ragas is imported. The stand-in is
# never exercised: the judge is a ChatOpenAI client.
# ---------------------------------------------------------------------------
_VERTEXAI_MODULE = "langchain_community.chat_models.vertexai"
if _VERTEXAI_MODULE not in sys.modules:
    _vx = types.ModuleType(_VERTEXAI_MODULE)
    # Placeholder class so ``from ... import ChatVertexAI`` resolves; never instantiated.
    setattr(_vx, "ChatVertexAI", type("ChatVertexAI", (), {}))
    sys.modules[_VERTEXAI_MODULE] = _vx

import numpy as np  # noqa: E402

# NOTE: ragas / langchain_openai are imported lazily inside the judging path
# only. The ``--generate`` phase (torch: embedder + reranker + Chroma) then runs
# with exactly the app's import surface — importing the heavy ragas stack
# alongside torch was triggering a native segfault on this Windows / Python 3.14
# box. Generation and judging are therefore split into two processes.

import rag_chain  # noqa: E402
from config import GOOGLE_API_KEY, RERANKER_MODEL, RETRIEVAL_FETCH_K  # noqa: E402

# NOTE: RAGAS's batch ``evaluate()`` uses an async executor whose per-job
# ``asyncio.timeout`` is incompatible with Python 3.14's asyncio. We instead
# call each metric's synchronous ``single_turn_score`` in a plain loop — same
# RAGAS metric implementations and prompts, just driven sequentially.

# Judge — Gemini 3.1 Flash Lite via Google's OpenAI-compatible endpoint (httpx,
# NOT the grpc client, which segfaults alongside torch on Python 3.14). 500 RPD
# / 250K TPM gives headroom for a full 12-question A/B. (Groq free tiers proved
# too token-limited; Gemma's thinking mode breaks RAGAS JSON parsing.)
JUDGE_MODEL = "gemini-3.1-flash-lite"
# Base URL used by both the judge (``_build_judge``) and answer generation
# (``_generate``).
GOOGLE_OPENAI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"
# Answer generation runs on Google Gemma (the app's default model) — a separate
# rate-limit bucket from the judge.
# This is a display label only: it is written to the output metadata as
# ``gen_model``. The model id actually sent to the API is ``GEN_MODEL_ID``.
GEN_LLM_LABEL = "Gemma 4 MoE 26B  [Google]"
# Final scores file, written next to the vector-store directory.
RESULTS_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_results.json"

# Canonical metric keys, in the order they are aggregated and printed.
METRIC_ORDER = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]


# ---------------------------------------------------------------------------
# Curated evaluation set — questions + reference (ground-truth) answers
# grounded in the 12 primary texts in the knowledge base.
#
# Each entry is a dict with two string keys:
#   "question"  — the user query sent through retrieval + generation
#   "reference" — the ground-truth answer passed to RAGAS as ``reference``
#
# Order matters: ``main`` selects entries by position (the first 4 for
# ``--quick``, the first 6 for the default judged run), so reordering this
# list changes which questions those shorter runs cover.
# ---------------------------------------------------------------------------
EVAL_SET: list[dict] = [
    {
        "question": "What does Nietzsche mean by the death of God?",
        "reference": (
            "Nietzsche uses the death of God to describe the collapse of "
            "belief in a transcendent source of meaning and morality. It is "
            "not a literal claim but a diagnosis of modern culture: with the "
            "divine foundation gone, inherited values lose their ground, "
            "risking nihilism, and humanity must create new values itself."
        ),
    },
    {
        "question": "How does Schopenhauer view suffering and the will to live?",
        "reference": (
            "Schopenhauer holds that the will to live is the blind, insatiable "
            "force underlying all existence. Because desire is endless and its "
            "satisfaction fleeting, life is dominated by suffering. Relief comes "
            "only through denial of the will, aesthetic contemplation, and "
            "compassion."
        ),
    },
    {
        "question": "What is Hume's argument about causation?",
        "reference": (
            "Hume argues that we never perceive a necessary connection between "
            "cause and effect, only the constant conjunction of events. Our "
            "belief in causation is a habit of the mind formed by repeated "
            "experience, not a truth derived from reason."
        ),
    },
    {
        "question": "Can we have certain knowledge of the external world according to Russell?",
        "reference": (
            "Russell distinguishes knowledge by acquaintance from knowledge by "
            "description. We are directly acquainted only with sense-data; the "
            "existence of physical objects is an inference. He argues the "
            "external world is the best hypothesis explaining our sense-data, "
            "though not known with absolute certainty."
        ),
    },
    {
        "question": "What is Kant's categorical imperative?",
        "reference": (
            "Kant's categorical imperative is an unconditional moral law: act "
            "only on a maxim you could will to become a universal law. It "
            "commands independently of desires or consequences, and requires "
            "treating humanity always as an end and never merely as a means."
        ),
    },
    {
        "question": "How does Mill justify the principle of utility?",
        "reference": (
            "Mill grounds morality in the greatest happiness principle: actions "
            "are right insofar as they promote happiness and wrong as they "
            "produce its reverse. He argues happiness is the sole thing desired "
            "as an end, and distinguishes higher (intellectual) from lower "
            "(bodily) pleasures by quality, not only quantity."
        ),
    },
    {
        "question": "What does Marcus Aurelius advise about things outside our control?",
        "reference": (
            "Marcus Aurelius, following Stoic doctrine, advises accepting what "
            "is outside our control as part of nature's order, and focusing only "
            "on our own judgments and actions. Externals cannot harm the rational "
            "self; disturbance comes from our opinions about events, not events "
            "themselves."
        ),
    },
    {
        "question": "What is Epictetus's distinction between what is in our power and what is not?",
        "reference": (
            "Epictetus opens the Enchiridion by dividing things into those in our "
            "power — our opinions, desires, and aversions — and those not — body, "
            "property, reputation. Tranquility comes from caring only about what "
            "is in our power and treating the rest with indifference."
        ),
    },
    {
        "question": "What is Plato's ideal society in The Republic?",
        "reference": (
            "Plato's ideal state is a just city ordered into three classes — "
            "rulers (philosopher-kings), guardians, and producers — mirroring the "
            "soul's reason, spirit, and appetite. Justice is each part performing "
            "its proper role. The guardian class shares property and family, and "
            "rulers are chosen for wisdom and educated to know the Good."
        ),
    },
    {
        "question": "What is the will to power in Nietzsche's thought?",
        "reference": (
            "For Nietzsche the will to power is the fundamental drive of life: "
            "not mere survival but the striving to grow, overcome, and impose "
            "form. It underlies values and actions, and the higher type affirms "
            "life by creating values out of this creative, self-overcoming force."
        ),
    },
    {
        "question": "How does Nietzsche characterize master and slave morality?",
        "reference": (
            "In the Genealogy of Morality, Nietzsche contrasts master morality, "
            "which originates in the strong and calls 'good' what is noble and "
            "powerful, with slave morality, which arises from the resentment of "
            "the weak and revalues humility, meekness, and pity as good while "
            "branding the strong as evil."
        ),
    },
    {
        "question": "What role does eternal recurrence play in Nietzsche's philosophy?",
        "reference": (
            "Eternal recurrence is the thought that one's life will repeat "
            "identically and infinitely. Nietzsche poses it as a test of life-"
            "affirmation: to will the eternal return of every moment, including "
            "suffering, is the highest expression of amor fati and saying yes to "
            "existence."
        ),
    },
]


# ---------------------------------------------------------------------------
# RAGAS plumbing
# ---------------------------------------------------------------------------

def _build_judge():
    """Construct the judge LLM and wrap it for RAGAS.

    The judge is a ``ChatOpenAI`` client pointed at ``GOOGLE_OPENAI_BASE`` with
    ``JUDGE_MODEL``. ``langchain_openai`` and ``ragas`` are imported here, not
    at module level, so they are only loaded when judging actually runs.

    Returns:
        A ``LangchainLLMWrapper`` around the configured chat model.
    """
    from langchain_openai import ChatOpenAI
    from ragas.llms import LangchainLLMWrapper

    judge_settings = {
        "model": JUDGE_MODEL,
        "api_key": GOOGLE_API_KEY,
        "base_url": GOOGLE_OPENAI_BASE,
        "temperature": 0.0,   # deterministic scoring
        "max_tokens": 3000,
        "timeout": 90,
        "max_retries": 4,     # absorb the occasional 15-RPM 429
    }
    chat_model = ChatOpenAI(**judge_settings)
    return LangchainLLMWrapper(chat_model)


# Gemini 3.1 Flash Lite allows 15 RPM. A short pace between metrics keeps
# multi-call metrics under the per-minute request limit.
PACE_SECONDS = 5


def _score_sample(sample: SingleTurnSample, scorers: dict) -> dict[str, float | None]:
    """Score one sample on all four metrics; isolate failures per-metric."""
    out: dict[str, float | None] = {}
    for i, (canon, metric) in enumerate(scorers.items()):
        if i:
            time.sleep(PACE_SECONDS)  # respect the 6000 TPM bucket
        try:
            v = float(metric.single_turn_score(sample))
            out[canon] = None if v != v else round(v, 3)  # NaN → None
        except Exception as exc:
            print(f"      {canon} failed: {str(exc)[:70]}")
            out[canon] = None
    return out


# Answers are generated with Gemma through Google's OpenAI-compatible endpoint
# (httpx), the same transport the judge uses. The native google.genai client
# goes through grpc, which segfaults alongside torch on this Python 3.14 box.
GEN_MODEL_ID = "gemma-4-26b-a4b-it"


def _generate(question: str) -> dict:
    """Retrieve passages for *question* and generate one answer from them.

    Args:
        question: The user question.

    Returns:
        ``{"answer": <generated text>, "context": <retrieved documents>}``.
    """
    # torch retrieval (no grpc); the second return value is not needed here.
    docs, _ = rag_chain.retrieve_docs(question, "All")
    passages = "\n\n".join(doc.page_content for doc in docs)

    from openai import OpenAI

    # Explicit timeout + bounded retries so a stalled request cannot hang the run.
    client = OpenAI(
        api_key=GOOGLE_API_KEY,
        base_url=GOOGLE_OPENAI_BASE,
        timeout=90,
        max_retries=2,
    )
    user_message = "\n\n".join([
        f"Relevant passages from your knowledge base:\n{passages}",
        f"Question: {question}",
    ])
    conversation = [
        {"role": "system", "content": rag_chain.SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=GEN_MODEL_ID,
        messages=conversation,
        temperature=0.3,
        timeout=90,
    )
    answer_text = completion.choices[0].message.content
    return {"answer": answer_text, "context": docs}


def _generate_with_retry(question: str, retries: int = 5):
    """RAG answer generation with backoff on Google RPM (429) limits."""
    for attempt in range(retries):
        try:
            return _generate(question)
        except Exception as exc:
            if attempt == retries - 1:
                raise
            wait = 8 * (attempt + 1)
            print(f"    generation retry in {wait}s ({str(exc)[:50]})")
            time.sleep(wait)


def run_config(name: str, use_reranker: bool, eval_set: list[dict], scorers: dict) -> dict:
    """Generate and judge every question in *eval_set* for one configuration.

    Args:
        name: Label printed in the section header.
        use_reranker: Value assigned to ``rag_chain.USE_RERANKER`` for the run.
        eval_set: Entries with ``"question"`` and ``"reference"`` keys.
        scorers: Metric-name to metric-object mapping passed to ``_score_sample``.

    Returns:
        ``{"aggregate": {metric: mean}, "per_question": [row, ...]}``. Each
        mean covers only the questions where that metric produced a score; a
        metric with no scores at all aggregates to ``0.0``.
    """
    from ragas import SingleTurnSample

    rag_chain.USE_RERANKER = use_reranker  # runtime toggle (see retrieve_docs)
    print(f"\n=== {name}  (reranker={'ON' if use_reranker else 'OFF'}) ===")

    def _cell(value):
        # Two-decimal score, or a dash placeholder when the metric gave none.
        return f"{value:.2f}" if value is not None else " — "

    total = len(eval_set)
    rows = []
    for idx, item in enumerate(eval_set, 1):
        started = time.perf_counter()
        question = item["question"]
        result = _generate_with_retry(question)
        contexts = [doc.page_content for doc in result["context"]]
        sample = SingleTurnSample(
            user_input=question,
            response=result["answer"],
            retrieved_contexts=contexts,
            reference=item["reference"],
        )
        scores = _score_sample(sample, scorers)
        rows.append({"question": question, **scores})
        elapsed = time.perf_counter() - started
        print(f"  [{idx}/{total}] {elapsed:5.1f}s  "
              f"F={_cell(scores['faithfulness'])} AR={_cell(scores['answer_relevancy'])} "
              f"CP={_cell(scores['context_precision'])} CR={_cell(scores['context_recall'])}  "
              f"{question[:46]}")

    aggregate = {}
    for metric_name in METRIC_ORDER:
        scored = [row[metric_name] for row in rows if row[metric_name] is not None]
        if scored:
            aggregate[metric_name] = round(float(np.mean(scored)), 4)
        else:
            aggregate[metric_name] = 0.0
    return {"aggregate": aggregate, "per_question": rows}


# Output of the ``--generate`` phase (question, reference, answer and contexts
# per config), written next to the vector-store directory.
SAMPLES_PATH = rag_chain.VECTORSTORE_DIR.parent / "eval_samples.json"

# (name, use_reranker, use_query_rewrite) — an incremental A/B/C ablation.
# ``generate_samples`` iterates these in order and assigns the two flags to
# ``rag_chain.USE_RERANKER`` / ``rag_chain.USE_QUERY_REWRITE`` for each config.
CONFIGS = [
    ("Baseline (Hybrid)", False, False),
    ("+ Reranker", True, False),
    ("+ Query Rewrite", True, True),
]


def generate_samples(eval_set: list[dict]) -> dict:
    """Phase A: produce RAG samples for every question under every config.

    Runs the real pipeline (retrieval + generation) and collects the results.
    Nothing is judged here, so the judge's rate limits do not apply and the
    torch+judge segfault cannot occur; judging is a separate phase.

    Args:
        eval_set: Entries with ``"question"`` and ``"reference"`` keys.

    Returns:
        Mapping of config name to a list of rows, each holding ``question``,
        ``reference``, ``answer`` and ``contexts`` (the retrieved page texts).
    """
    rag_chain.USE_CORRECTIVE_RAG = False  # never abstain during evaluation
    total = len(eval_set)
    samples_by_config: dict[str, list[dict]] = {}
    for label, rerank_on, rewrite_on in CONFIGS:
        rag_chain.USE_RERANKER = rerank_on
        rag_chain.USE_QUERY_REWRITE = rewrite_on
        print(f"\n=== Generating: {label}  (rerank={rerank_on}, rewrite={rewrite_on}) ===")
        collected = []
        for position, entry in enumerate(eval_set, start=1):
            generated = _generate_with_retry(entry["question"])
            page_texts = [doc.page_content for doc in generated["context"]]
            collected.append({
                "question": entry["question"],
                "reference": entry["reference"],
                "answer": generated["answer"],
                "contexts": page_texts,
            })
            print(f"  [{position}/{total}] {entry['question'][:55]}")
        samples_by_config[label] = collected
    return samples_by_config


def main() -> None:
    """Run the evaluation from the command line.

    Flags are read directly from ``sys.argv``:

    * ``--generate`` — run retrieval + generation for every entry in
      ``CONFIGS``, write the samples to ``SAMPLES_PATH`` and return without
      judging. Uses the whole ``EVAL_SET`` unless ``--quick`` is given.
    * ``--quick`` — use only the first 4 questions.
    * ``--full`` — judged run over the whole ``EVAL_SET``; without it the
      judged run uses the first 6 questions.

    The judged run scores two configurations (reranker off, then on) with
    ``run_config``, which generates answers in-process. It does not read
    ``SAMPLES_PATH``. Aggregates, deltas and per-question scores are written
    to ``RESULTS_PATH`` and a summary table is printed.

    Raises:
        SystemExit: If ``GOOGLE_API_KEY`` is empty or unset.
    """
    quick = "--quick" in sys.argv
    full = "--full" in sys.argv

    # Both phases authenticate with this key (``_generate`` for answers,
    # ``_build_judge`` for scoring). Check it before either phase starts so a
    # missing key fails immediately instead of after retrieval and a full
    # cycle of generation retries.
    if not GOOGLE_API_KEY:
        raise SystemExit(
            "GOOGLE_API_KEY not set — needed for answer generation and the judge model."
        )

    if "--generate" in sys.argv:
        eval_set = EVAL_SET[:4] if quick else EVAL_SET
        samples = {
            "metadata": {
                "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "gen_model": GEN_LLM_LABEL,
                "reranker_model": RERANKER_MODEL,
                "fetch_k": RETRIEVAL_FETCH_K,
                "n_questions": len(eval_set),
            },
            "samples": generate_samples(eval_set),
        }
        SAMPLES_PATH.write_text(json.dumps(samples, indent=2, ensure_ascii=False), encoding="utf-8")
        print(f"\nSaved {len(eval_set)} questions x {len(CONFIGS)} configs → {SAMPLES_PATH}")
        return

    # Judged run: default to the first 6 questions, --quick takes 4, --full all.
    # NOTE(review): the 6-question default was sized for the Groq free tier
    # (6000 TPM, ~70 min for the full set). The judge is now Gemini; re-measure
    # before relying on those timings.
    eval_set = EVAL_SET[:4] if quick else (EVAL_SET if full else EVAL_SET[:6])

    # Imported here, not at module level, so the --generate phase never loads ragas.
    from ragas.embeddings import LangchainEmbeddingsWrapper
    from ragas.metrics import (
        Faithfulness, LLMContextPrecisionWithReference,
        LLMContextRecall, ResponseRelevancy,
    )
    judge = _build_judge()
    embeddings = LangchainEmbeddingsWrapper(rag_chain._get_embeddings())
    scorers = {
        "faithfulness": Faithfulness(llm=judge),
        # strictness=1 → asks the LLM for n=1 completion. This was required by
        # Groq; TODO confirm whether the current judge endpoint accepts n > 1.
        "answer_relevancy": ResponseRelevancy(llm=judge, embeddings=embeddings, strictness=1),
        "context_precision": LLMContextPrecisionWithReference(llm=judge),
        "context_recall": LLMContextRecall(llm=judge),
    }

    baseline = run_config("Baseline — Hybrid (no rerank)", False, eval_set, scorers)
    reranked = run_config("With Cross-Encoder Rerank", True, eval_set, scorers)

    # Per-metric change from enabling the reranker (reranked minus baseline).
    deltas = {
        m: round(reranked["aggregate"].get(m, 0.0) - baseline["aggregate"].get(m, 0.0), 4)
        for m in METRIC_ORDER
    }

    import ragas
    out = {
        "metadata": {
            "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
            "framework": f"ragas {ragas.__version__}",
            "judge_model": JUDGE_MODEL,
            "gen_model": GEN_LLM_LABEL,
            "reranker_model": RERANKER_MODEL,
            "fetch_k": RETRIEVAL_FETCH_K,
            "n_questions": len(eval_set),
        },
        "configs": {
            "Baseline (Hybrid, no rerank)": baseline["aggregate"],
            "With Cross-Encoder Rerank": reranked["aggregate"],
        },
        "deltas": deltas,
        "per_question": {
            "Baseline (Hybrid, no rerank)": baseline["per_question"],
            "With Cross-Encoder Rerank": reranked["per_question"],
        },
    }
    RESULTS_PATH.write_text(json.dumps(out, indent=2), encoding="utf-8")

    print("\n" + "=" * 72)
    print(f"{'Metric':<22}{'Baseline':>12}{'Reranked':>12}{'Δ':>10}")
    print("-" * 72)
    for m in METRIC_ORDER:
        b = baseline["aggregate"].get(m, 0.0)
        r = reranked["aggregate"].get(m, 0.0)
        print(f"{m:<22}{b:>12.3f}{r:>12.3f}{deltas[m]:>+10.3f}")
    print("=" * 72)
    print(f"Saved → {RESULTS_PATH}")


if __name__ == "__main__":
    main()