Spaces:

Param20h
/

PDF-Assit_RAG

Running

App Files Files Community

Srushti-Kamble committed 7 days ago

Commit

234da68

1 Parent(s): cb1048d

test(rag): add RAGAS evaluation pipeline

Browse files

Files changed (8) hide show

.gitignore +2 -1
README.md +6 -0
backend/app/evaluation/__init__.py +2 -0
backend/app/evaluation/ragas_pipeline.py +292 -0
backend/evaluation/ragas_sample_questions.jsonl +50 -0
backend/requirements.txt +1 -0
backend/scripts/run_ragas_eval.py +59 -0
backend/tests/test_ragas_pipeline.py +76 -0

.gitignore CHANGED Viewed

@@ -8,6 +8,7 @@ __pycache__/
 # Data (runtime generated)
 data/
 *.db
 # Environment
 .env
@@ -29,4 +30,4 @@ Thumbs.db
 # Misc
 *.log
 static/
-.planning/

 # Data (runtime generated)
 data/
 *.db
+backend/evaluation/ragas_results.json
 # Environment
 .env
 # Misc
 *.log
 static/
+.planning/

README.md CHANGED Viewed

@@ -524,6 +524,12 @@ docker compose up --build
 |---------|-------------|
 | `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
 | `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
 ### Frontend (`frontend/`)

 |---------|-------------|
 | `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
 | `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
+| `python scripts/run_ragas_eval.py --user-id <user-id>` | Run the 50-question RAGAS comparison for vector search vs GraphRAG |
+The RAGAS script reads `backend/evaluation/ragas_sample_questions.jsonl`,
+generates answers from standard vector contexts and vector-plus-GraphRAG
+contexts, then writes aggregate scores to `backend/evaluation/ragas_results.json`.
+Pass `--document-id <document-id>` to evaluate one indexed document.
 ### Frontend (`frontend/`)

backend/app/evaluation/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Evaluation helpers for offline RAG quality checks."""
2	+

backend/app/evaluation/ragas_pipeline.py ADDED Viewed

	@@ -0,0 +1,292 @@

+"""RAGAS evaluation pipeline for vector search versus GraphRAG."""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from statistics import mean
+from typing import Any, Callable, Iterable, Optional
+from huggingface_hub import InferenceClient
+from app.config import get_settings
+from app.rag.embeddings import embed_query
+from app.rag.graph_retriever import get_entity_context
+from app.rag.vectorstore import query_chunks
# Application settings, loaded once when this module is imported. The helpers
# below read TOP_K_RETRIEVAL, HF_TOKEN, LLM_MODEL and LLM_MAX_NEW_TOKENS from it.
settings = get_settings()

# Shape of an answer generator: (question, retrieved contexts) -> answer text.
AnswerGenerator = Callable[[str, list[str]], str]
@dataclass(frozen=True)
class EvaluationQuestion:
    """One immutable row of the evaluation dataset."""

    # Identifier of the dataset row.
    id: str
    # Question text used for retrieval and answer generation.
    question: str
    # Reference answer the generated response is compared against.
    reference: str
@dataclass(frozen=True)
class EvaluationRecord:
    """One generated sample for a single question under one retrieval mode."""

    # Identifier of the originating dataset row.
    id: str
    # Retrieval mode that produced the contexts ("vector" or "graphrag").
    mode: str
    # Question text that was asked.
    question: str
    # Reference answer from the dataset.
    reference: str
    # Answer produced by the answer generator from `contexts`.
    response: str
    # Context strings that were handed to the answer generator.
    contexts: list[str]
def load_questions(dataset_path: Path, limit: int = 50) -> list[EvaluationQuestion]:
    """Load a JSONL RAGAS dataset and validate the required fields.

    Blank lines are skipped, and reading stops as soon as ``limit`` questions
    have been collected.

    Args:
        dataset_path: JSONL file holding one JSON object per non-blank line,
            each with ``id``, ``question`` and ``reference`` keys.
        limit: Exact number of questions to return; must be at least 1.

    Returns:
        The first ``limit`` questions in file order, with surrounding
        whitespace removed from ``question`` and ``reference``.

    Raises:
        ValueError: If ``limit`` is not positive, a line is not valid JSON,
            a row is not a JSON object, a required field is missing, null or
            blank, or the file holds fewer than ``limit`` questions.
    """
    # Without this guard a zero or negative limit would still return the
    # first question, because the loop appends before it checks the limit.
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit}")

    required = ("id", "question", "reference")
    questions: list[EvaluationQuestion] = []
    with dataset_path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {line_number}: {exc}") from exc
            # A JSON array, string or number is valid JSON but not a dataset row.
            if not isinstance(row, dict):
                raise ValueError(
                    f"Line {line_number} must contain a JSON object, got {type(row).__name__}"
                )
            missing = sorted(name for name in required if name not in row)
            if missing:
                fields = ", ".join(missing)
                raise ValueError(f"Line {line_number} is missing required field(s): {fields}")
            # str(None) would otherwise turn a null field into the text "None".
            blank = sorted(
                name for name in required if row[name] is None or not str(row[name]).strip()
            )
            if blank:
                fields = ", ".join(blank)
                raise ValueError(f"Line {line_number} has empty required field(s): {fields}")
            questions.append(
                EvaluationQuestion(
                    id=str(row["id"]),
                    question=str(row["question"]).strip(),
                    reference=str(row["reference"]).strip(),
                )
            )
            if len(questions) >= limit:
                break
    if len(questions) < limit:
        raise ValueError(f"Expected {limit} evaluation questions, found {len(questions)}")
    return questions
def retrieve_vector_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return the chunk texts that plain vector search finds for ``question``.

    The question is embedded with ``embed_query`` and the embedding is passed
    to ``query_chunks`` together with ``user_id`` and ``document_id``. When
    ``top_k`` is ``None`` or ``0`` the configured ``TOP_K_RETRIEVAL`` is used.
    """
    embedding = embed_query(question)
    # `or` rather than an `is None` test: any falsy top_k selects the default.
    result_limit = top_k or settings.TOP_K_RETRIEVAL
    hits = query_chunks(
        query_embedding=embedding,
        user_id=user_id,
        document_id=document_id,
        top_k=result_limit,
    )
    return _chunk_texts(hits)
def retrieve_graphrag_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return vector contexts followed by GraphRAG relationship context.

    Runs the same vector retrieval as ``retrieve_vector_contexts``, asks
    ``get_entity_context`` for graph context on the same question, and merges
    the two through ``append_graph_context``.
    """
    # Both lookups receive the same owner and document arguments.
    scope = {"user_id": user_id, "document_id": document_id}
    vector_contexts = retrieve_vector_contexts(question=question, top_k=top_k, **scope)
    relationships = get_entity_context(query=question, **scope)
    return append_graph_context(vector_contexts, relationships)
def append_graph_context(contexts: list[str], graph_context: str) -> list[str]:
    """Return a new context list, with graph context appended when present.

    Args:
        contexts: Retrieved vector contexts. The list is never modified.
        graph_context: Relationship text from GraphRAG. ``None``, an empty
            string, or whitespace-only text all mean "nothing found".

    Returns:
        A fresh list holding ``contexts`` plus the stripped graph context as
        the last element when there is any. The result is always a new list,
        so mutating it cannot affect the caller's ``contexts``.
    """
    # `or ""` tolerates a retriever that reports "no context" as None.
    cleaned = (graph_context or "").strip()
    combined = list(contexts)
    if cleaned:
        combined.append(cleaned)
    return combined
def generate_grounded_answer(question: str, contexts: list[str]) -> str:
    """Ask the configured chat model to answer ``question`` from ``contexts`` only.

    With no contexts a fixed "not enough context" sentence is returned and no
    model call is made. Otherwise the contexts are numbered from 1, placed in
    a single prompt, and sent through ``InferenceClient.chat_completion`` at
    temperature 0 with at most 512 new tokens.

    Returns:
        The stripped content of the first choice, or ``""`` when the response
        has no choices or the first choice has no content.
    """
    if not contexts:
        return "I do not have enough retrieved context to answer this question."

    client = InferenceClient(token=settings.HF_TOKEN)

    numbered = [
        f"Context {index}:\n{context}" for index, context in enumerate(contexts, start=1)
    ]
    context_block = "\n\n".join(numbered)
    instructions = (
        "Answer the question using only the provided context. "
        "If the context is insufficient, say that the answer is not available in the context.\n\n"
    )
    prompt = f"{instructions}{context_block}\n\nQuestion: {question}"

    system_message = {
        "role": "system",
        "content": "You are a careful RAG evaluator that only uses supplied evidence.",
    }
    user_message = {"role": "user", "content": prompt}

    response = client.chat_completion(
        messages=[system_message, user_message],
        model=settings.LLM_MODEL,
        # Cap generation at 512 tokens even when the configured limit is higher.
        max_tokens=min(settings.LLM_MAX_NEW_TOKENS, 512),
        temperature=0.0,
    )

    choices = response.choices
    if not choices:
        return ""
    content = choices[0].message.content
    return (content or "").strip()
def collect_records(
    questions: Iterable[EvaluationQuestion],
    user_id: str,
    document_id: Optional[str] = None,
    answer_generator: AnswerGenerator = generate_grounded_answer,
) -> dict[str, list[EvaluationRecord]]:
    """Build one vector record and one GraphRAG record per question.

    For every question both retrievers run first (vector, then GraphRAG);
    ``answer_generator`` is then called once per mode with that mode's contexts.

    Returns:
        A dict with the keys ``"vector"`` and ``"graphrag"``, each mapping to
        the records of that mode in question order.
    """
    grouped: dict[str, list[EvaluationRecord]] = {"vector": [], "graphrag": []}
    for item in questions:
        lookup = {
            "question": item.question,
            "user_id": user_id,
            "document_id": document_id,
        }
        # Dict literals evaluate in order, so vector retrieval runs before GraphRAG.
        contexts_by_mode = {
            "vector": retrieve_vector_contexts(**lookup),
            "graphrag": retrieve_graphrag_contexts(**lookup),
        }
        for mode, contexts in contexts_by_mode.items():
            answer = answer_generator(item.question, contexts)
            grouped[mode].append(
                EvaluationRecord(
                    id=item.id,
                    mode=mode,
                    question=item.question,
                    reference=item.reference,
                    response=answer,
                    contexts=contexts,
                )
            )
    return grouped
def evaluate_records(records: list[EvaluationRecord]) -> dict[str, float]:
    """Run RAGAS over collected records and return mean metric scores.

    Each record becomes one single-turn sample (``user_input``,
    ``retrieved_contexts``, ``response``, ``reference``). The samples are
    scored with Faithfulness, FactualCorrectness and LLMContextRecall, using
    the configured Hugging Face model as the judging LLM.

    Args:
        records: Samples of one retrieval mode.

    Returns:
        Metric name to mean score, as produced by ``summarize_ragas_result``.
        An empty ``records`` list yields ``{}`` without calling RAGAS.
    """
    if not records:
        # Nothing to score: skip the heavy imports and never hand RAGAS an
        # empty dataset (NOTE(review): RAGAS behaviour on empty input unverified).
        return {}

    # Imported lazily so that importing this module does not need these packages.
    from langchain_huggingface import HuggingFaceEndpoint
    from ragas import EvaluationDataset, evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import Faithfulness, FactualCorrectness, LLMContextRecall

    samples = [
        {
            "user_input": record.question,
            "retrieved_contexts": record.contexts,
            "response": record.response,
            "reference": record.reference,
        }
        for record in records
    ]
    dataset = EvaluationDataset.from_list(samples)
    evaluator_llm = LangchainLLMWrapper(
        HuggingFaceEndpoint(
            repo_id=settings.LLM_MODEL,
            huggingfacehub_api_token=settings.HF_TOKEN,
            max_new_tokens=512,
            temperature=0.0,
            timeout=300,
        )
    )
    result = evaluate(
        dataset=dataset,
        metrics=[
            Faithfulness(),
            FactualCorrectness(),
            LLMContextRecall(),
        ],
        llm=evaluator_llm,
    )
    return summarize_ragas_result(result)
def compare_pipelines(grouped_records: dict[str, list[EvaluationRecord]]) -> dict[str, Any]:
    """Score both retrieval modes and report how GraphRAG differs from vector search.

    Returns:
        A dict with the per-mode scores under ``"vector"`` and ``"graphrag"``
        and, under ``"delta"``, ``graphrag - vector`` for every metric seen in
        either mode, rounded to four decimals. A metric missing from one mode
        counts as ``0.0`` on that side.
    """
    vector_scores = evaluate_records(grouped_records["vector"])
    graphrag_scores = evaluate_records(grouped_records["graphrag"])

    delta: dict[str, float] = {}
    for metric in sorted({*vector_scores, *graphrag_scores}):
        gain = graphrag_scores.get(metric, 0.0) - vector_scores.get(metric, 0.0)
        delta[metric] = round(gain, 4)

    return {"vector": vector_scores, "graphrag": graphrag_scores, "delta": delta}
def summarize_ragas_result(result: Any) -> dict[str, float]:
    """Normalize RAGAS result objects into mean metric scores.

    Three result shapes are understood, checked in this order:

    1. an object with ``to_pandas()``: every column is averaged;
    2. a plain ``dict`` of metric name to score;
    3. an object whose ``scores`` attribute is a list of per-sample dicts.

    In every shape only ``int``/``float`` values count, and NaN values are
    ignored, so one sample without a score cannot turn a metric's mean into
    NaN. A metric with no usable value is left out of the result.

    Returns:
        Metric name to mean score, rounded to four decimals.

    Raises:
        TypeError: If ``result`` matches none of the supported shapes.
    """
    if hasattr(result, "to_pandas"):
        dataframe = result.to_pandas()
        return _mean_by_metric(
            {str(column): dataframe[column].tolist() for column in dataframe.columns}
        )
    if isinstance(result, dict):
        return _mean_by_metric({str(key): [value] for key, value in result.items()})
    rows = getattr(result, "scores", None)
    if isinstance(rows, list):
        by_metric: dict[str, list[Any]] = {}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for key, value in row.items():
                by_metric.setdefault(str(key), []).append(value)
        return _mean_by_metric(by_metric)
    raise TypeError(f"Unsupported RAGAS result type: {type(result)!r}")


def _mean_by_metric(values_by_metric: dict[str, list[Any]]) -> dict[str, float]:
    """Average the numeric, non-NaN values of each metric, rounded to four decimals."""
    import math

    means: dict[str, float] = {}
    for metric, values in values_by_metric.items():
        usable = [
            float(value)
            for value in values
            if isinstance(value, (int, float)) and not math.isnan(value)
        ]
        if usable:
            means[metric] = round(mean(usable), 4)
    return means
def _chunk_texts(chunks: list[dict[str, Any]]) -> list[str]:
    """Return the ``"text"`` value, as ``str``, of every chunk that has a truthy one."""
    texts: list[str] = []
    for chunk in chunks:
        # Chunks with a missing, empty or otherwise falsy "text" are skipped.
        if not chunk.get("text"):
            continue
        texts.append(str(chunk["text"]))
    return texts

backend/evaluation/ragas_sample_questions.jsonl ADDED Viewed

	@@ -0,0 +1,50 @@

+{"id":"q001","question":"What is the main purpose of PDF-Assistant-RAG?","reference":"PDF-Assistant-RAG helps users upload documents, retrieve relevant document context, and ask questions answered through a retrieval-augmented generation workflow."}
+{"id":"q002","question":"Which backend framework serves the API?","reference":"The backend API is served by FastAPI."}
+{"id":"q003","question":"Which frontend framework is used for the application interface?","reference":"The frontend is a Next.js application."}
+{"id":"q004","question":"What does the document upload route do before saving permanent state?","reference":"The upload route validates filename, extension, size, MIME type, and parser readability before moving a file into permanent storage."}
+{"id":"q005","question":"Which vector database stores retrieved document chunks?","reference":"ChromaDB stores document chunks for vector retrieval."}
+{"id":"q006","question":"Which embedding model is configured by default?","reference":"The default embedding model is sentence-transformers/all-MiniLM-L6-v2."}
+{"id":"q007","question":"What is the default embedding dimension?","reference":"The default embedding dimension is 384."}
+{"id":"q008","question":"What is the purpose of TOP_K_RETRIEVAL?","reference":"TOP_K_RETRIEVAL controls how many candidate chunks are retrieved before reranking."}
+{"id":"q009","question":"What is the purpose of TOP_K_RERANK?","reference":"TOP_K_RERANK controls how many reranked chunks are finally passed to answer generation."}
+{"id":"q010","question":"Which model family is used for reranking by default?","reference":"The default reranker is a cross-encoder model, cross-encoder/ms-marco-MiniLM-L-6-v2."}
+{"id":"q011","question":"How does the backend identify authenticated users?","reference":"Authenticated routes use JWT identity through the current-user dependency."}
+{"id":"q012","question":"What data must user-facing routes filter by?","reference":"User-facing routes must filter documents, files, vector chunks, and chat data by the authenticated user's id."}
+{"id":"q013","question":"What does the health endpoint check?","reference":"The health endpoint checks service health such as API, SQL database, and Chroma availability."}
+{"id":"q014","question":"What does the chat route provide besides normal JSON answers?","reference":"The chat route supports server-sent events so answers can stream tokens to the frontend."}
+{"id":"q015","question":"What is GraphRAG used for in this project?","reference":"GraphRAG builds and retrieves lightweight entity co-occurrence relationships to add graph context to document answers."}
+{"id":"q016","question":"Where are GraphRAG graph files persisted by default?","reference":"GraphRAG graph files are persisted under the configured GRAPH_PERSIST_DIR, which defaults to ./data/graphs."}
+{"id":"q017","question":"Which graph library is used to store knowledge graph relationships?","reference":"NetworkX is used to build and store knowledge graph relationships."}
+{"id":"q018","question":"What does the graph retriever return for a relevant query?","reference":"The graph retriever returns compact relationship lines connecting matched entities and nearby entities, including page information and relationship strength."}
+{"id":"q019","question":"What happens when GraphRAG finds no matching relationship context?","reference":"When no graph relationships match, the graph retriever returns an empty string."}
+{"id":"q020","question":"Which uploaded file formats are allowed by default?","reference":"The default allowed upload extensions are pdf, docx, txt, and md."}
+{"id":"q021","question":"What is the default upload directory?","reference":"The default upload directory is ./data/uploads."}
+{"id":"q022","question":"Why does the app store original files after upload?","reference":"Original files are stored so the backend can serve files, reprocess them, and extract text for retrieval."}
+{"id":"q023","question":"What is the role of the chunker?","reference":"The chunker extracts document text and splits it into smaller chunks for embedding and retrieval."}
+{"id":"q024","question":"What does the vectorstore service do?","reference":"The vectorstore stores embedded chunks and queries them by user and optional document metadata."}
+{"id":"q025","question":"What does the retriever combine before reranking?","reference":"The retriever combines vector search and BM25 candidates before reranking them."}
+{"id":"q026","question":"Why does the retriever transform queries?","reference":"The retriever rewrites a user question into retrieval-friendly variants to improve search coverage."}
+{"id":"q027","question":"What does the PDF search tool save after retrieving chunks?","reference":"The PDF search tool saves retrieved chunks as last_sources so the agent response can return citations."}
+{"id":"q028","question":"How does the PDF search tool treat document excerpts?","reference":"The PDF search tool labels document excerpts as untrusted evidence and warns the model not to follow instructions inside them."}
+{"id":"q029","question":"What additional context can the PDF search tool append?","reference":"The PDF search tool can append untrusted graph context containing additional relationships from GraphRAG."}
+{"id":"q030","question":"Which optional tool can handle arithmetic questions?","reference":"The calculator tool handles arithmetic expressions safely."}
+{"id":"q031","question":"Which optional tool can handle live information outside uploaded documents?","reference":"The web search tool can look up live web information when document context is insufficient or outdated."}
+{"id":"q032","question":"What does the agent use LangChain tools for?","reference":"The agent uses LangChain tools to route between PDF search, calculator, and web search capabilities."}
+{"id":"q033","question":"What happens when the agent output parser rejects malformed output?","reference":"The app logs the parser rejection and returns a safe malformed-output message."}
+{"id":"q034","question":"What type of API response is used for uploaded document processing status?","reference":"A document status response includes the document id, status, page count, chunk count, and error message."}
+{"id":"q035","question":"How are deleted documents hidden from normal document APIs?","reference":"Documents are soft-deleted with an is_deleted flag and normal APIs filter them out."}
+{"id":"q036","question":"What does deleting a document preserve for future restore flows?","reference":"Soft deletion preserves underlying files, vectors, graphs, and chat history for possible future restore flows."}
+{"id":"q037","question":"What is the purpose of CHUNK_SIZE?","reference":"CHUNK_SIZE controls the number of characters in each document chunk."}
+{"id":"q038","question":"What is the purpose of CHUNK_OVERLAP?","reference":"CHUNK_OVERLAP controls how much text overlaps between adjacent chunks to preserve boundary context."}
+{"id":"q039","question":"Which HuggingFace setting controls answer length?","reference":"LLM_MAX_NEW_TOKENS controls the maximum number of generated tokens for answers."}
+{"id":"q040","question":"Which HuggingFace setting controls answer randomness?","reference":"LLM_TEMPERATURE controls sampling randomness during answer generation."}
+{"id":"q041","question":"What environment variable stores the HuggingFace token?","reference":"HF_TOKEN stores the HuggingFace API token used for inference."}
+{"id":"q042","question":"Why should DEBUG not be enabled in production?","reference":"DEBUG enables detailed behavior intended for development and should not be enabled in production."}
+{"id":"q043","question":"How are production CORS origins configured?","reference":"Production CORS origins are configured through ALLOWED_ORIGINS."}
+{"id":"q044","question":"What database is used by default for local development?","reference":"The default database URL points to a local SQLite database at ./data/app.db."}
+{"id":"q045","question":"What database does Docker Compose provide for the stack?","reference":"Docker Compose provides a PostgreSQL database service for the stack."}
+{"id":"q046","question":"What is the contributor target branch for pull requests?","reference":"Contributor pull requests should target the dev branch."}
+{"id":"q047","question":"Which branch is production protected for deployment?","reference":"The main branch is treated as the production branch for deployment."}
+{"id":"q048","question":"Where can developers view Swagger locally?","reference":"Developers can view Swagger at /docs when the backend is running locally."}
+{"id":"q049","question":"What does the architecture document focus on?","reference":"The architecture document focuses on how requests move through the system and how major runtime components interact."}
+{"id":"q050","question":"Why is a RAGAS evaluation pipeline useful for this project?","reference":"A RAGAS evaluation pipeline provides quantitative scores to compare standard vector search with GraphRAG and track retrieval and answer quality over time."}

backend/requirements.txt CHANGED Viewed

@@ -38,6 +38,7 @@ langchain-huggingface
 langchain-text-splitters
 langsmith
 rank-bm25
 # Embeddings & ML
 sentence-transformers

 langchain-text-splitters
 langsmith
 rank-bm25
+ragas>=0.3.0
 # Embeddings & ML
 sentence-transformers

backend/scripts/run_ragas_eval.py ADDED Viewed

	@@ -0,0 +1,59 @@

+"""Run a 50-question RAGAS comparison for vector search and GraphRAG."""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
# This file lives at <backend>/scripts/<this file>, so the backend directory is
# two levels up. Deriving it from the file location, instead of rebuilding it
# as ROOT / "backend", keeps the script working when the backend tree is not
# inside a directory literally named "backend" (for example, copied elsewhere).
BACKEND_DIR = Path(__file__).resolve().parents[1]
ROOT = BACKEND_DIR.parent
# Make the `app` package importable when the script is run directly.
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))
DEFAULT_DATASET = BACKEND_DIR / "evaluation" / "ragas_sample_questions.jsonl"
DEFAULT_OUTPUT = BACKEND_DIR / "evaluation" / "ragas_results.json"
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    Args:
        argv: Arguments to parse. ``None`` (the default) lets argparse read
            them from ``sys.argv[1:]``, which is what ``main`` relies on.

    Returns:
        Namespace with ``user_id``, ``document_id``, ``dataset``, ``output``
        and ``limit``.

    Raises:
        SystemExit: Raised by argparse on invalid input, including a
            ``--limit`` that is not a positive integer.
    """

    def positive_int(raw: str) -> int:
        # int() raising ValueError is reported by argparse as an invalid value.
        value = int(raw)
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
        return value

    parser = argparse.ArgumentParser(
        description="Evaluate vector search versus GraphRAG with RAGAS.",
    )
    parser.add_argument("--user-id", required=True, help="Owner user id for indexed documents.")
    parser.add_argument("--document-id", help="Optional single document id to evaluate.")
    parser.add_argument("--dataset", type=Path, default=DEFAULT_DATASET)
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    # Zero or negative limits are rejected here rather than surfacing later.
    parser.add_argument("--limit", type=positive_int, default=50)
    return parser.parse_args(argv)
def main() -> None:
    """Run the evaluation end to end and write the scores to the output file.

    Loads the questions, collects vector and GraphRAG records for the given
    user (and optional document), scores both modes, writes a JSON report to
    ``--output`` and prints the scores.
    """
    args = parse_args()

    # Imported here, after the module-level sys.path setup has run.
    from app.evaluation.ragas_pipeline import collect_records, compare_pipelines, load_questions

    questions = load_questions(args.dataset, limit=args.limit)
    scores = compare_pipelines(
        collect_records(
            questions=questions,
            user_id=args.user_id,
            document_id=args.document_id,
        )
    )

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "dataset": str(args.dataset),
        "question_count": len(questions),
        "user_id": args.user_id,
        "document_id": args.document_id,
        "scores": scores,
    }

    output_path = args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    print(json.dumps(scores, indent=2))
    print(f"Wrote RAGAS evaluation results to {output_path}")


if __name__ == "__main__":
    main()

backend/tests/test_ragas_pipeline.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import json
+from types import SimpleNamespace
+from app.evaluation import ragas_pipeline
+from app.evaluation.ragas_pipeline import (
+    EvaluationQuestion,
+    append_graph_context,
+    collect_records,
+    load_questions,
+    summarize_ragas_result,
+)
def test_load_questions_requires_exact_limit(tmp_path):
    """load_questions returns exactly ``limit`` rows and rejects shorter datasets."""
    dataset = tmp_path / "questions.jsonl"
    rows = [
        {"id": "q1", "question": "Question 1?", "reference": "Reference 1."},
        {"id": "q2", "question": "Question 2?", "reference": "Reference 2."},
    ]
    dataset.write_text("\n".join(json.dumps(row) for row in rows), encoding="utf-8")

    questions = load_questions(dataset, limit=2)
    assert [question.id for question in questions] == ["q1", "q2"]
    assert questions[0].question == "Question 1?"

    # The behaviour the test name promises: a limit above the number of
    # available rows must fail rather than return a short list.
    try:
        load_questions(dataset, limit=3)
    except ValueError as exc:
        assert "Expected 3 evaluation questions, found 2" in str(exc)
    else:
        raise AssertionError("load_questions accepted a dataset shorter than the limit")
def test_append_graph_context_skips_empty_context():
    """Whitespace-only graph context is dropped; real graph context goes last."""
    without_graph = append_graph_context(["vector context"], "  ")
    with_graph = append_graph_context(["vector context"], "graph context")

    assert without_graph == ["vector context"]
    assert with_graph == ["vector context", "graph context"]
def test_collect_records_builds_vector_and_graphrag_samples(monkeypatch):
    """collect_records yields one record per mode, each answered from its own contexts."""

    def fake_vector(**_kwargs):
        return ["Alpha vector context."]

    def fake_graphrag(**_kwargs):
        return ["Alpha vector context.", "Alpha is related to Beta."]

    def count_contexts(question, contexts):
        return f"{question} -> {len(contexts)} contexts"

    # Replace both retrievers so no embedding model or vector store is touched.
    monkeypatch.setattr(ragas_pipeline, "retrieve_vector_contexts", fake_vector)
    monkeypatch.setattr(ragas_pipeline, "retrieve_graphrag_contexts", fake_graphrag)

    records = collect_records(
        questions=[
            EvaluationQuestion(id="q1", question="What is Alpha?", reference="Alpha is a product."),
        ],
        user_id="user-1",
        answer_generator=count_contexts,
    )

    vector_record = records["vector"][0]
    graphrag_record = records["graphrag"][0]
    assert vector_record.mode == "vector"
    assert vector_record.response.endswith("1 contexts")
    assert graphrag_record.mode == "graphrag"
    assert graphrag_record.response.endswith("2 contexts")
def test_summarize_ragas_result_averages_score_rows():
    """Per-sample score rows are averaged metric by metric."""
    per_sample_scores = [
        {"faithfulness": 1.0, "context_recall": 0.5},
        {"faithfulness": 0.5, "context_recall": 1.0},
    ]
    # A bare object exposing only `scores` exercises the list-of-rows branch.
    result = SimpleNamespace(scores=per_sample_scores)

    summary = summarize_ragas_result(result)

    expected = {"faithfulness": 0.75, "context_recall": 0.75}
    assert summary == expected