Spaces:
Running
Running
Srushti-Kamble committed on
Commit ·
234da68
1
Parent(s): cb1048d
test(rag): add RAGAS evaluation pipeline
Browse files- .gitignore +2 -1
- README.md +6 -0
- backend/app/evaluation/__init__.py +2 -0
- backend/app/evaluation/ragas_pipeline.py +292 -0
- backend/evaluation/ragas_sample_questions.jsonl +50 -0
- backend/requirements.txt +1 -0
- backend/scripts/run_ragas_eval.py +59 -0
- backend/tests/test_ragas_pipeline.py +76 -0
.gitignore
CHANGED
|
@@ -8,6 +8,7 @@ __pycache__/
|
|
| 8 |
# Data (runtime generated)
|
| 9 |
data/
|
| 10 |
*.db
|
|
|
|
| 11 |
|
| 12 |
# Environment
|
| 13 |
.env
|
|
@@ -29,4 +30,4 @@ Thumbs.db
|
|
| 29 |
# Misc
|
| 30 |
*.log
|
| 31 |
static/
|
| 32 |
-
.planning/
|
|
|
|
| 8 |
# Data (runtime generated)
|
| 9 |
data/
|
| 10 |
*.db
|
| 11 |
+
backend/evaluation/ragas_results.json
|
| 12 |
|
| 13 |
# Environment
|
| 14 |
.env
|
|
|
|
| 30 |
# Misc
|
| 31 |
*.log
|
| 32 |
static/
|
| 33 |
+
.planning/
|
README.md
CHANGED
|
@@ -524,6 +524,12 @@ docker compose up --build
|
|
| 524 |
|---------|-------------|
|
| 525 |
| `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
|
| 526 |
| `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
### Frontend (`frontend/`)
|
| 529 |
|
|
|
|
| 524 |
|---------|-------------|
|
| 525 |
| `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
|
| 526 |
| `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
|
| 527 |
+
| `python scripts/run_ragas_eval.py --user-id <user-id>` | Run the 50-question RAGAS comparison for vector search vs GraphRAG |
|
| 528 |
+
|
| 529 |
+
The RAGAS script reads `backend/evaluation/ragas_sample_questions.jsonl`,
|
| 530 |
+
generates answers from standard vector contexts and vector-plus-GraphRAG
|
| 531 |
+
contexts, then writes aggregate scores to `backend/evaluation/ragas_results.json`.
|
| 532 |
+
Pass `--document-id <document-id>` to evaluate one indexed document.
|
| 533 |
|
| 534 |
### Frontend (`frontend/`)
|
| 535 |
|
backend/app/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation helpers for offline RAG quality checks."""
|
| 2 |
+
|
backend/app/evaluation/ragas_pipeline.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAGAS evaluation pipeline for vector search versus GraphRAG."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from statistics import mean
|
| 8 |
+
from typing import Any, Callable, Iterable, Optional
|
| 9 |
+
|
| 10 |
+
from huggingface_hub import InferenceClient
|
| 11 |
+
|
| 12 |
+
from app.config import get_settings
|
| 13 |
+
from app.rag.embeddings import embed_query
|
| 14 |
+
from app.rag.graph_retriever import get_entity_context
|
| 15 |
+
from app.rag.vectorstore import query_chunks
|
| 16 |
+
|
| 17 |
+
settings = get_settings()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
AnswerGenerator = Callable[[str, list[str]], str]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass(frozen=True)
class EvaluationQuestion:
    """One immutable question/reference pair loaded from the evaluation dataset."""

    # Identifier of the dataset row, stored as a string.
    id: str
    # Question text used for retrieval and answer generation.
    question: str
    # Expected answer the generated response is compared against.
    reference: str
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
class EvaluationRecord:
    """One generated sample (question, answer and contexts) for a retrieval mode."""

    # Identifier copied from the originating question.
    id: str
    # Retrieval mode that produced the contexts ("vector" or "graphrag").
    mode: str
    # Question text that was asked.
    question: str
    # Expected answer from the dataset.
    reference: str
    # Answer produced by the answer generator from ``contexts``.
    response: str
    # Retrieved context strings the answer was generated from.
    contexts: list[str]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def load_questions(dataset_path: Path, limit: int = 50) -> list[EvaluationQuestion]:
    """Load a JSONL RAGAS dataset and validate the required fields.

    Blank lines are skipped. Parsing stops as soon as ``limit`` questions have
    been collected, so rows beyond the limit are never parsed or validated.

    Args:
        dataset_path: JSONL file holding one JSON object per non-blank line.
        limit: Exact number of questions to return.

    Returns:
        The first ``limit`` questions in file order.

    Raises:
        ValueError: If a line is not valid JSON, is not a JSON object, lacks
            one of ``id``/``question``/``reference``, or the file contains
            fewer than ``limit`` questions.
    """
    required_fields = {"id", "question", "reference"}
    questions: list[EvaluationQuestion] = []

    with dataset_path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            # Check the cap before consuming a row so ``limit=0`` yields an
            # empty list instead of one question.
            if len(questions) >= limit:
                break

            stripped = line.strip()
            if not stripped:
                continue

            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {line_number}: {exc}") from exc

            # A JSON list or scalar would otherwise fail later with a
            # TypeError instead of the documented ValueError.
            if not isinstance(row, dict):
                raise ValueError(
                    f"Line {line_number} must be a JSON object, got {type(row).__name__}"
                )

            missing = required_fields - set(row)
            if missing:
                fields = ", ".join(sorted(missing))
                raise ValueError(f"Line {line_number} is missing required field(s): {fields}")

            questions.append(
                EvaluationQuestion(
                    id=str(row["id"]),
                    question=str(row["question"]).strip(),
                    reference=str(row["reference"]).strip(),
                )
            )

    if len(questions) < limit:
        raise ValueError(f"Expected {limit} evaluation questions, found {len(questions)}")

    return questions
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def retrieve_vector_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return the texts of the chunks that vector search finds for *question*.

    The question is embedded with ``embed_query`` and looked up through
    ``query_chunks`` using the given user and optional document id. A falsy
    ``top_k`` (``None`` or ``0``) falls back to ``settings.TOP_K_RETRIEVAL``.
    """
    embedding = embed_query(question)
    result_count = top_k or settings.TOP_K_RETRIEVAL
    matches = query_chunks(
        query_embedding=embedding,
        user_id=user_id,
        document_id=document_id,
        top_k=result_count,
    )
    return _chunk_texts(matches)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def retrieve_graphrag_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return vector-search contexts followed by the GraphRAG context, if any.

    The vector contexts come from ``retrieve_vector_contexts``; the graph
    context comes from ``get_entity_context`` and is appended only when it is
    non-blank (see ``append_graph_context``).
    """
    # Both lookups take the same user/document arguments.
    scope = {"user_id": user_id, "document_id": document_id}

    vector_contexts = retrieve_vector_contexts(question=question, top_k=top_k, **scope)
    relationships = get_entity_context(query=question, **scope)
    return append_graph_context(vector_contexts, relationships)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def append_graph_context(contexts: list[str], graph_context: Optional[str]) -> list[str]:
    """Return contexts plus graph context when GraphRAG found relationships.

    Args:
        contexts: Context strings retrieved so far. Never mutated.
        graph_context: Graph relationship text. ``None``, empty, or
            whitespace-only values mean nothing is appended.

    Returns:
        A new list holding ``contexts`` and, when present, the stripped graph
        context as the final element. A fresh list is returned on every path
        so callers can modify the result without affecting ``contexts``.
    """
    combined = list(contexts)
    # ``or ""`` tolerates a retriever that reports "no context" as None.
    cleaned = (graph_context or "").strip()
    if cleaned:
        combined.append(cleaned)
    return combined
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def generate_grounded_answer(question: str, contexts: list[str]) -> str:
    """Ask the configured chat model to answer *question* from *contexts* only.

    With no contexts a fixed "not enough context" sentence is returned and the
    model is not called. Otherwise the contexts are numbered from 1 and sent
    with the question to ``settings.LLM_MODEL`` at temperature 0.0, with the
    token budget capped at 512. An empty string is returned when the response
    has no choices or no content.
    """
    if not contexts:
        return "I do not have enough retrieved context to answer this question."

    numbered_contexts = [
        f"Context {position}:\n{text}" for position, text in enumerate(contexts, start=1)
    ]
    instructions = (
        "Answer the question using only the provided context. "
        "If the context is insufficient, say that the answer is not available in the context.\n\n"
    )
    user_message = instructions + "\n\n".join(numbered_contexts) + f"\n\nQuestion: {question}"

    system_message = {
        "role": "system",
        "content": "You are a careful RAG evaluator that only uses supplied evidence.",
    }
    token_budget = min(settings.LLM_MAX_NEW_TOKENS, 512)

    client = InferenceClient(token=settings.HF_TOKEN)
    response = client.chat_completion(
        messages=[system_message, {"role": "user", "content": user_message}],
        model=settings.LLM_MODEL,
        max_tokens=token_budget,
        temperature=0.0,
    )

    choices = response.choices
    if not choices:
        return ""
    content = choices[0].message.content
    return content.strip() if content else ""
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def collect_records(
    questions: Iterable[EvaluationQuestion],
    user_id: str,
    document_id: Optional[str] = None,
    answer_generator: AnswerGenerator = generate_grounded_answer,
) -> dict[str, list[EvaluationRecord]]:
    """Create one vector record and one GraphRAG record per question.

    For each question both retrieval modes run first, then
    ``answer_generator(question, contexts)`` is called once per mode. The
    result maps ``"vector"`` and ``"graphrag"`` to their records in question
    order.
    """
    grouped: dict[str, list[EvaluationRecord]] = {"vector": [], "graphrag": []}

    for item in questions:
        retrieval_args = {
            "question": item.question,
            "user_id": user_id,
            "document_id": document_id,
        }
        # Insertion order keeps the original sequence: vector retrieval,
        # GraphRAG retrieval, then answers in the same mode order.
        contexts_by_mode = {
            "vector": retrieve_vector_contexts(**retrieval_args),
            "graphrag": retrieve_graphrag_contexts(**retrieval_args),
        }
        for mode, contexts in contexts_by_mode.items():
            record = EvaluationRecord(
                id=item.id,
                mode=mode,
                question=item.question,
                reference=item.reference,
                response=answer_generator(item.question, contexts),
                contexts=contexts,
            )
            grouped[mode].append(record)

    return grouped
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def evaluate_records(records: list[EvaluationRecord]) -> dict[str, float]:
    """Score *records* with RAGAS and return the mean value of each metric.

    Each record becomes a RAGAS sample (``user_input``, ``retrieved_contexts``,
    ``response``, ``reference``). Faithfulness, FactualCorrectness and
    LLMContextRecall are evaluated with ``settings.LLM_MODEL`` served through
    a Hugging Face endpoint, and the result is reduced by
    ``summarize_ragas_result``.
    """
    # NOTE(review): imported inside the function, presumably so the module can
    # be imported without the evaluation-only dependencies — confirm.
    from langchain_huggingface import HuggingFaceEndpoint
    from ragas import EvaluationDataset, evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import Faithfulness, FactualCorrectness, LLMContextRecall

    samples = []
    for record in records:
        samples.append(
            {
                "user_input": record.question,
                "retrieved_contexts": record.contexts,
                "response": record.response,
                "reference": record.reference,
            }
        )
    dataset = EvaluationDataset.from_list(samples)

    endpoint = HuggingFaceEndpoint(
        repo_id=settings.LLM_MODEL,
        huggingfacehub_api_token=settings.HF_TOKEN,
        max_new_tokens=512,
        temperature=0.0,
        timeout=300,
    )
    evaluator_llm = LangchainLLMWrapper(endpoint)

    metrics = [Faithfulness(), FactualCorrectness(), LLMContextRecall()]
    result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
    return summarize_ragas_result(result)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def compare_pipelines(grouped_records: dict[str, list[EvaluationRecord]]) -> dict[str, Any]:
    """Score the vector and GraphRAG records and report per-metric differences.

    The result has three keys: ``"vector"`` and ``"graphrag"`` hold the mean
    scores for each mode, and ``"delta"`` holds ``graphrag - vector`` per
    metric, rounded to four decimals. A metric missing from one side counts as
    0.0 there.
    """
    vector_scores = evaluate_records(grouped_records["vector"])
    graphrag_scores = evaluate_records(grouped_records["graphrag"])

    delta: dict[str, float] = {}
    for metric in sorted(vector_scores.keys() | graphrag_scores.keys()):
        difference = graphrag_scores.get(metric, 0.0) - vector_scores.get(metric, 0.0)
        delta[metric] = round(difference, 4)

    return {"vector": vector_scores, "graphrag": graphrag_scores, "delta": delta}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def _is_score(value: Any) -> bool:
    """Return True for int/float values that are not NaN."""
    # ``value == value`` is False only for NaN.
    return isinstance(value, (int, float)) and value == value


def summarize_ragas_result(result: Any) -> dict[str, float]:
    """Normalize RAGAS result objects into mean metric scores.

    Three result shapes are supported, checked in this order:

    1. Objects with ``to_pandas()``: every column with at least one numeric
       value is averaged; columns without numeric values are dropped.
    2. Plain dicts: numeric values are rounded and returned as-is.
    3. Objects with a ``scores`` list of dicts: values are averaged per key.

    NaN values are ignored in every branch, so a single NaN cannot turn a
    metric's mean into NaN.

    Args:
        result: Value returned by ``ragas.evaluate`` or a compatible stand-in.

    Returns:
        Mapping of metric name to its mean score, rounded to four decimals.

    Raises:
        TypeError: If ``result`` matches none of the supported shapes.
    """
    if hasattr(result, "to_pandas"):
        dataframe = result.to_pandas()
        scores: dict[str, float] = {}
        for column in dataframe.columns:
            values = [float(value) for value in dataframe[column].tolist() if _is_score(value)]
            if values:
                scores[str(column)] = round(mean(values), 4)
        return scores

    if isinstance(result, dict):
        return {
            str(key): round(float(value), 4)
            for key, value in result.items()
            if _is_score(value)
        }

    rows = getattr(result, "scores", None)
    if isinstance(rows, list):
        by_metric: dict[str, list[float]] = {}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for key, value in row.items():
                if _is_score(value):
                    by_metric.setdefault(str(key), []).append(float(value))
        return {key: round(mean(values), 4) for key, values in by_metric.items()}

    raise TypeError(f"Unsupported RAGAS result type: {type(result)!r}")
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def _chunk_texts(chunks: list[dict[str, Any]]) -> list[str]:
    """Return the ``"text"`` value of each chunk as a string, skipping falsy texts."""
    texts: list[str] = []
    for chunk in chunks:
        # Chunks with a missing, empty, or None text contribute nothing.
        if not chunk.get("text"):
            continue
        texts.append(str(chunk["text"]))
    return texts
|
| 292 |
+
|
backend/evaluation/ragas_sample_questions.jsonl
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id":"q001","question":"What is the main purpose of PDF-Assistant-RAG?","reference":"PDF-Assistant-RAG helps users upload documents, retrieve relevant document context, and ask questions answered through a retrieval-augmented generation workflow."}
|
| 2 |
+
{"id":"q002","question":"Which backend framework serves the API?","reference":"The backend API is served by FastAPI."}
|
| 3 |
+
{"id":"q003","question":"Which frontend framework is used for the application interface?","reference":"The frontend is a Next.js application."}
|
| 4 |
+
{"id":"q004","question":"What does the document upload route do before saving permanent state?","reference":"The upload route validates filename, extension, size, MIME type, and parser readability before moving a file into permanent storage."}
|
| 5 |
+
{"id":"q005","question":"Which vector database stores retrieved document chunks?","reference":"ChromaDB stores document chunks for vector retrieval."}
|
| 6 |
+
{"id":"q006","question":"Which embedding model is configured by default?","reference":"The default embedding model is sentence-transformers/all-MiniLM-L6-v2."}
|
| 7 |
+
{"id":"q007","question":"What is the default embedding dimension?","reference":"The default embedding dimension is 384."}
|
| 8 |
+
{"id":"q008","question":"What is the purpose of TOP_K_RETRIEVAL?","reference":"TOP_K_RETRIEVAL controls how many candidate chunks are retrieved before reranking."}
|
| 9 |
+
{"id":"q009","question":"What is the purpose of TOP_K_RERANK?","reference":"TOP_K_RERANK controls how many reranked chunks are finally passed to answer generation."}
|
| 10 |
+
{"id":"q010","question":"Which model family is used for reranking by default?","reference":"The default reranker is a cross-encoder model, cross-encoder/ms-marco-MiniLM-L-6-v2."}
|
| 11 |
+
{"id":"q011","question":"How does the backend identify authenticated users?","reference":"Authenticated routes use JWT identity through the current-user dependency."}
|
| 12 |
+
{"id":"q012","question":"What data must user-facing routes filter by?","reference":"User-facing routes must filter documents, files, vector chunks, and chat data by the authenticated user's id."}
|
| 13 |
+
{"id":"q013","question":"What does the health endpoint check?","reference":"The health endpoint checks service health such as API, SQL database, and Chroma availability."}
|
| 14 |
+
{"id":"q014","question":"What does the chat route provide besides normal JSON answers?","reference":"The chat route supports server-sent events so answers can stream tokens to the frontend."}
|
| 15 |
+
{"id":"q015","question":"What is GraphRAG used for in this project?","reference":"GraphRAG builds and retrieves lightweight entity co-occurrence relationships to add graph context to document answers."}
|
| 16 |
+
{"id":"q016","question":"Where are GraphRAG graph files persisted by default?","reference":"GraphRAG graph files are persisted under the configured GRAPH_PERSIST_DIR, which defaults to ./data/graphs."}
|
| 17 |
+
{"id":"q017","question":"Which graph library is used to store knowledge graph relationships?","reference":"NetworkX is used to build and store knowledge graph relationships."}
|
| 18 |
+
{"id":"q018","question":"What does the graph retriever return for a relevant query?","reference":"The graph retriever returns compact relationship lines connecting matched entities and nearby entities, including page information and relationship strength."}
|
| 19 |
+
{"id":"q019","question":"What happens when GraphRAG finds no matching relationship context?","reference":"When no graph relationships match, the graph retriever returns an empty string."}
|
| 20 |
+
{"id":"q020","question":"Which uploaded file formats are allowed by default?","reference":"The default allowed upload extensions are pdf, docx, txt, and md."}
|
| 21 |
+
{"id":"q021","question":"What is the default upload directory?","reference":"The default upload directory is ./data/uploads."}
|
| 22 |
+
{"id":"q022","question":"Why does the app store original files after upload?","reference":"Original files are stored so the backend can serve files, reprocess them, and extract text for retrieval."}
|
| 23 |
+
{"id":"q023","question":"What is the role of the chunker?","reference":"The chunker extracts document text and splits it into smaller chunks for embedding and retrieval."}
|
| 24 |
+
{"id":"q024","question":"What does the vectorstore service do?","reference":"The vectorstore stores embedded chunks and queries them by user and optional document metadata."}
|
| 25 |
+
{"id":"q025","question":"What does the retriever combine before reranking?","reference":"The retriever combines vector search and BM25 candidates before reranking them."}
|
| 26 |
+
{"id":"q026","question":"Why does the retriever transform queries?","reference":"The retriever rewrites a user question into retrieval-friendly variants to improve search coverage."}
|
| 27 |
+
{"id":"q027","question":"What does the PDF search tool save after retrieving chunks?","reference":"The PDF search tool saves retrieved chunks as last_sources so the agent response can return citations."}
|
| 28 |
+
{"id":"q028","question":"How does the PDF search tool treat document excerpts?","reference":"The PDF search tool labels document excerpts as untrusted evidence and warns the model not to follow instructions inside them."}
|
| 29 |
+
{"id":"q029","question":"What additional context can the PDF search tool append?","reference":"The PDF search tool can append untrusted graph context containing additional relationships from GraphRAG."}
|
| 30 |
+
{"id":"q030","question":"Which optional tool can handle arithmetic questions?","reference":"The calculator tool handles arithmetic expressions safely."}
|
| 31 |
+
{"id":"q031","question":"Which optional tool can handle live information outside uploaded documents?","reference":"The web search tool can look up live web information when document context is insufficient or outdated."}
|
| 32 |
+
{"id":"q032","question":"What does the agent use LangChain tools for?","reference":"The agent uses LangChain tools to route between PDF search, calculator, and web search capabilities."}
|
| 33 |
+
{"id":"q033","question":"What happens when the agent output parser rejects malformed output?","reference":"The app logs the parser rejection and returns a safe malformed-output message."}
|
| 34 |
+
{"id":"q034","question":"What type of API response is used for uploaded document processing status?","reference":"A document status response includes the document id, status, page count, chunk count, and error message."}
|
| 35 |
+
{"id":"q035","question":"How are deleted documents hidden from normal document APIs?","reference":"Documents are soft-deleted with an is_deleted flag and normal APIs filter them out."}
|
| 36 |
+
{"id":"q036","question":"What does deleting a document preserve for future restore flows?","reference":"Soft deletion preserves underlying files, vectors, graphs, and chat history for possible future restore flows."}
|
| 37 |
+
{"id":"q037","question":"What is the purpose of CHUNK_SIZE?","reference":"CHUNK_SIZE controls the number of characters in each document chunk."}
|
| 38 |
+
{"id":"q038","question":"What is the purpose of CHUNK_OVERLAP?","reference":"CHUNK_OVERLAP controls how much text overlaps between adjacent chunks to preserve boundary context."}
|
| 39 |
+
{"id":"q039","question":"Which HuggingFace setting controls answer length?","reference":"LLM_MAX_NEW_TOKENS controls the maximum number of generated tokens for answers."}
|
| 40 |
+
{"id":"q040","question":"Which HuggingFace setting controls answer randomness?","reference":"LLM_TEMPERATURE controls sampling randomness during answer generation."}
|
| 41 |
+
{"id":"q041","question":"What environment variable stores the HuggingFace token?","reference":"HF_TOKEN stores the HuggingFace API token used for inference."}
|
| 42 |
+
{"id":"q042","question":"Why should DEBUG not be enabled in production?","reference":"DEBUG enables detailed behavior intended for development and should not be enabled in production."}
|
| 43 |
+
{"id":"q043","question":"How are production CORS origins configured?","reference":"Production CORS origins are configured through ALLOWED_ORIGINS."}
|
| 44 |
+
{"id":"q044","question":"What database is used by default for local development?","reference":"The default database URL points to a local SQLite database at ./data/app.db."}
|
| 45 |
+
{"id":"q045","question":"What database does Docker Compose provide for the stack?","reference":"Docker Compose provides a PostgreSQL database service for the stack."}
|
| 46 |
+
{"id":"q046","question":"What is the contributor target branch for pull requests?","reference":"Contributor pull requests should target the dev branch."}
|
| 47 |
+
{"id":"q047","question":"Which branch is production protected for deployment?","reference":"The main branch is treated as the production branch for deployment."}
|
| 48 |
+
{"id":"q048","question":"Where can developers view Swagger locally?","reference":"Developers can view Swagger at /docs when the backend is running locally."}
|
| 49 |
+
{"id":"q049","question":"What does the architecture document focus on?","reference":"The architecture document focuses on how requests move through the system and how major runtime components interact."}
|
| 50 |
+
{"id":"q050","question":"Why is a RAGAS evaluation pipeline useful for this project?","reference":"A RAGAS evaluation pipeline provides quantitative scores to compare standard vector search with GraphRAG and track retrieval and answer quality over time."}
|
backend/requirements.txt
CHANGED
|
@@ -38,6 +38,7 @@ langchain-huggingface
|
|
| 38 |
langchain-text-splitters
|
| 39 |
langsmith
|
| 40 |
rank-bm25
|
|
|
|
| 41 |
|
| 42 |
# Embeddings & ML
|
| 43 |
sentence-transformers
|
|
|
|
| 38 |
langchain-text-splitters
|
| 39 |
langsmith
|
| 40 |
rank-bm25
|
| 41 |
+
ragas>=0.3.0
|
| 42 |
|
| 43 |
# Embeddings & ML
|
| 44 |
sentence-transformers
|
backend/scripts/run_ragas_eval.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run a 50-question RAGAS comparison for vector search and GraphRAG."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import sys
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# The script lives in backend/scripts/, so the third parent of this file is
# the repository root.
ROOT = Path(__file__).resolve().parents[2]
BACKEND_DIR = ROOT.joinpath("backend")

# Put the backend directory first on sys.path so ``app`` imports resolve when
# the script is run directly.
if sys.path.count(str(BACKEND_DIR)) == 0:
    sys.path.insert(0, str(BACKEND_DIR))

# Default input dataset and output report, both under backend/evaluation/.
DEFAULT_DATASET = BACKEND_DIR.joinpath("evaluation", "ragas_sample_questions.jsonl")
DEFAULT_OUTPUT = BACKEND_DIR.joinpath("evaluation", "ragas_results.json")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for an evaluation run.

    ``--user-id`` is required. ``--document-id`` is optional. ``--dataset``
    and ``--output`` are paths that default to the module-level defaults, and
    ``--limit`` is an integer defaulting to 50.
    """
    cli = argparse.ArgumentParser(
        description="Evaluate vector search versus GraphRAG with RAGAS.",
    )
    cli.add_argument("--user-id", required=True, help="Owner user id for indexed documents.")
    cli.add_argument("--document-id", help="Optional single document id to evaluate.")
    for flag, default_path in (("--dataset", DEFAULT_DATASET), ("--output", DEFAULT_OUTPUT)):
        cli.add_argument(flag, type=Path, default=default_path)
    cli.add_argument("--limit", type=int, default=50)
    return cli.parse_args()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main() -> None:
    """Run the evaluation end to end and write the scores to the output file.

    Loads the questions, builds vector and GraphRAG records, scores both with
    RAGAS, writes a JSON report (timestamp, dataset path, question count, user
    and document ids, scores) to ``--output``, and prints the scores.
    """
    args = parse_args()

    # NOTE(review): imported here rather than at module top, presumably so
    # argument errors and --help do not pay for loading the pipeline — confirm.
    from app.evaluation.ragas_pipeline import collect_records, compare_pipelines, load_questions

    questions = load_questions(args.dataset, limit=args.limit)
    scores = compare_pipelines(
        collect_records(
            questions=questions,
            user_id=args.user_id,
            document_id=args.document_id,
        )
    )

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "dataset": str(args.dataset),
        "question_count": len(questions),
        "user_id": args.user_id,
        "document_id": args.document_id,
        "scores": scores,
    }

    output_path = args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    print(json.dumps(scores, indent=2))
    print(f"Wrote RAGAS evaluation results to {output_path}")


if __name__ == "__main__":
    main()
|
backend/tests/test_ragas_pipeline.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from types import SimpleNamespace
|
| 3 |
+
|
| 4 |
+
from app.evaluation import ragas_pipeline
|
| 5 |
+
from app.evaluation.ragas_pipeline import (
|
| 6 |
+
EvaluationQuestion,
|
| 7 |
+
append_graph_context,
|
| 8 |
+
collect_records,
|
| 9 |
+
load_questions,
|
| 10 |
+
summarize_ragas_result,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_load_questions_requires_exact_limit(tmp_path):
    """load_questions returns exactly ``limit`` rows and rejects a short dataset."""
    dataset = tmp_path / "questions.jsonl"
    rows = [
        {"id": "q1", "question": "Question 1?", "reference": "Reference 1."},
        {"id": "q2", "question": "Question 2?", "reference": "Reference 2."},
    ]
    dataset.write_text("\n".join(json.dumps(row) for row in rows), encoding="utf-8")

    questions = load_questions(dataset, limit=2)

    assert [question.id for question in questions] == ["q1", "q2"]
    assert questions[0].question == "Question 1?"

    # The dataset holds two rows, so asking for three must be rejected; this
    # is the "exact limit" requirement the test name refers to.
    try:
        load_questions(dataset, limit=3)
    except ValueError as exc:
        assert "Expected 3 evaluation questions, found 2" in str(exc)
    else:
        raise AssertionError("load_questions accepted a dataset shorter than the limit")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_append_graph_context_skips_empty_context():
    """Whitespace-only graph context is dropped; real graph context is appended last."""
    without_graph = append_graph_context(["vector context"], " ")
    with_graph = append_graph_context(["vector context"], "graph context")

    assert without_graph == ["vector context"]
    assert with_graph == ["vector context", "graph context"]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_collect_records_builds_vector_and_graphrag_samples(monkeypatch):
    """collect_records yields one record per mode, each answered from its own contexts."""

    def stub_returning(contexts):
        # Stand-in retriever: accepts any keyword arguments, returns fixed contexts.
        return lambda **_kwargs: contexts

    monkeypatch.setattr(
        ragas_pipeline,
        "retrieve_vector_contexts",
        stub_returning(["Alpha vector context."]),
    )
    monkeypatch.setattr(
        ragas_pipeline,
        "retrieve_graphrag_contexts",
        stub_returning(["Alpha vector context.", "Alpha is related to Beta."]),
    )

    def describe(question, contexts):
        return f"{question} -> {len(contexts)} contexts"

    alpha = EvaluationQuestion(id="q1", question="What is Alpha?", reference="Alpha is a product.")
    records = collect_records(
        questions=[alpha],
        user_id="user-1",
        answer_generator=describe,
    )

    vector_record = records["vector"][0]
    graphrag_record = records["graphrag"][0]
    assert vector_record.mode == "vector"
    assert vector_record.response.endswith("1 contexts")
    assert graphrag_record.mode == "graphrag"
    assert graphrag_record.response.endswith("2 contexts")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_summarize_ragas_result_averages_score_rows():
    """Per-row score dicts on ``result.scores`` are averaged metric by metric."""
    score_rows = [
        {"faithfulness": 1.0, "context_recall": 0.5},
        {"faithfulness": 0.5, "context_recall": 1.0},
    ]
    expected = {"faithfulness": 0.75, "context_recall": 0.75}

    summary = summarize_ragas_result(SimpleNamespace(scores=score_rows))

    assert summary == expected
|
| 76 |
+
|