Srushti-Kamble committed on
Commit
234da68
·
1 Parent(s): cb1048d

test(rag): add RAGAS evaluation pipeline

Browse files
.gitignore CHANGED
@@ -8,6 +8,7 @@ __pycache__/
8
  # Data (runtime generated)
9
  data/
10
  *.db
 
11
 
12
  # Environment
13
  .env
@@ -29,4 +30,4 @@ Thumbs.db
29
  # Misc
30
  *.log
31
  static/
32
- .planning/
 
8
  # Data (runtime generated)
9
  data/
10
  *.db
11
+ backend/evaluation/ragas_results.json
12
 
13
  # Environment
14
  .env
 
30
  # Misc
31
  *.log
32
  static/
33
+ .planning/
README.md CHANGED
@@ -524,6 +524,12 @@ docker compose up --build
524
  |---------|-------------|
525
  | `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
526
  | `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
 
 
 
 
 
 
527
 
528
  ### Frontend (`frontend/`)
529
 
 
524
  |---------|-------------|
525
  | `uvicorn app.main:app --reload` | Start FastAPI with hot reload |
526
  | `uvicorn app.main:app --port 8000` | Start FastAPI on port 8000 |
527
+ | `python scripts/run_ragas_eval.py --user-id <user-id>` | Run the 50-question RAGAS comparison for vector search vs GraphRAG |
528
+
529
+ The RAGAS script reads `backend/evaluation/ragas_sample_questions.jsonl`,
530
+ generates answers from standard vector contexts and vector-plus-GraphRAG
531
+ contexts, then writes aggregate scores to `backend/evaluation/ragas_results.json`.
532
+ Pass `--document-id <document-id>` to evaluate one indexed document.
533
 
534
  ### Frontend (`frontend/`)
535
 
backend/app/evaluation/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Evaluation helpers for offline RAG quality checks."""
2
+
backend/app/evaluation/ragas_pipeline.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAGAS evaluation pipeline for vector search versus GraphRAG."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from statistics import mean
8
+ from typing import Any, Callable, Iterable, Optional
9
+
10
+ from huggingface_hub import InferenceClient
11
+
12
+ from app.config import get_settings
13
+ from app.rag.embeddings import embed_query
14
+ from app.rag.graph_retriever import get_entity_context
15
+ from app.rag.vectorstore import query_chunks
16
+
17
+ settings = get_settings()
18
+
19
+
20
+ AnswerGenerator = Callable[[str, list[str]], str]
21
+
22
+
23
@dataclass(frozen=True)
class EvaluationQuestion:
    """One benchmark question together with its reference answer."""

    # Identifier of the dataset row the question came from.
    id: str
    # Question text sent to retrieval and answer generation.
    question: str
    # Reference answer the generated response is compared against.
    reference: str
28
+
29
+
30
@dataclass(frozen=True)
class EvaluationRecord:
    """One answered question for a single retrieval mode."""

    # Identifier copied from the originating question.
    id: str
    # Retrieval mode label; collect_records writes "vector" or "graphrag".
    mode: str
    # Question text that was asked.
    question: str
    # Reference answer for the question.
    reference: str
    # Answer produced from the retrieved contexts.
    response: str
    # Context strings that were handed to the answer generator.
    contexts: list[str]
38
+
39
+
40
def load_questions(dataset_path: Path, limit: int = 50) -> list[EvaluationQuestion]:
    """Load a JSONL RAGAS dataset and validate the required fields.

    Blank lines are skipped. Reading stops as soon as ``limit`` questions
    have been collected, so lines after that point are not validated.

    Args:
        dataset_path: JSONL file holding one JSON object per non-blank line.
            Every object needs ``id``, ``question`` and ``reference`` keys.
        limit: Exact number of questions to return; must be at least 1.

    Returns:
        The first ``limit`` questions in file order, with ``question`` and
        ``reference`` stripped of surrounding whitespace.

    Raises:
        ValueError: If ``limit`` is below 1, a line is not valid JSON, a line
            is not a JSON object, a required field is missing, or the file
            holds fewer than ``limit`` questions.
    """
    # Without this guard a limit of 0 (or less) would still return the first
    # question, because the size check below only runs after an append.
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit}")

    required_fields = {"id", "question", "reference"}
    questions: list[EvaluationQuestion] = []

    with dataset_path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue

            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {line_number}: {exc}") from exc

            # Scalars and arrays are valid JSON but cannot carry named fields;
            # reject them here instead of failing later with an opaque error.
            if not isinstance(row, dict):
                raise ValueError(
                    f"Line {line_number} must be a JSON object, got {type(row).__name__}"
                )

            missing = required_fields - set(row)
            if missing:
                fields = ", ".join(sorted(missing))
                raise ValueError(f"Line {line_number} is missing required field(s): {fields}")

            questions.append(
                EvaluationQuestion(
                    id=str(row["id"]),
                    question=str(row["question"]).strip(),
                    reference=str(row["reference"]).strip(),
                )
            )

            if len(questions) >= limit:
                break

    if len(questions) < limit:
        raise ValueError(f"Expected {limit} evaluation questions, found {len(questions)}")

    return questions
75
+
76
+
77
def retrieve_vector_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return the chunk texts that plain vector search finds for a question.

    Args:
        question: Text to embed and search with.
        user_id: Forwarded to the vector store query as ``user_id``.
        document_id: Forwarded to the vector store query as ``document_id``.
        top_k: Number of chunks to request. ``None`` (or any falsy value)
            falls back to ``settings.TOP_K_RETRIEVAL``.

    Returns:
        The ``text`` of each returned chunk, skipping chunks without text.
    """
    question_vector = embed_query(question)
    # `or` means an explicit 0 also falls back to the configured default.
    result_count = top_k or settings.TOP_K_RETRIEVAL
    matches = query_chunks(
        query_embedding=question_vector,
        user_id=user_id,
        document_id=document_id,
        top_k=result_count,
    )
    return _chunk_texts(matches)
92
+
93
+
94
def retrieve_graphrag_contexts(
    question: str,
    user_id: str,
    document_id: Optional[str] = None,
    top_k: Optional[int] = None,
) -> list[str]:
    """Return vector-search contexts followed by the GraphRAG context, if any.

    Args:
        question: Text used for both the vector search and the graph lookup.
        user_id: Forwarded to both retrievers.
        document_id: Forwarded to both retrievers.
        top_k: Forwarded to the vector search only.

    Returns:
        The vector contexts, with the graph context added as one extra entry
        when it is not blank.
    """
    # Both retrievers take the same ownership scope.
    scope = {"user_id": user_id, "document_id": document_id}
    vector_contexts = retrieve_vector_contexts(question=question, top_k=top_k, **scope)
    relationship_text = get_entity_context(query=question, **scope)
    return append_graph_context(vector_contexts, relationship_text)
113
+
114
+
115
def append_graph_context(contexts: list[str], graph_context: str) -> list[str]:
    """Return contexts plus graph context when GraphRAG found relationships.

    Args:
        contexts: Context strings from vector search; never modified.
        graph_context: Relationship text from the graph retriever. ``None``
            and blank or whitespace-only text count as "nothing found".

    Returns:
        A new list holding ``contexts`` and, when present, the stripped graph
        context as the final entry.
    """
    # Always copy so the result never aliases the caller's list, whichever
    # branch is taken.
    combined = list(contexts)
    cleaned = (graph_context or "").strip()
    if cleaned:
        combined.append(cleaned)
    return combined
121
+
122
+
123
def generate_grounded_answer(question: str, contexts: list[str]) -> str:
    """Ask the configured chat model to answer from the given contexts only.

    Args:
        question: Question to answer.
        contexts: Retrieved context strings; they are numbered from 1 in the
            prompt.

    Returns:
        The stripped model reply, a fixed fallback sentence when ``contexts``
        is empty (no model call is made), or ``""`` when the model returns no
        choices or no content.
    """
    if not contexts:
        return "I do not have enough retrieved context to answer this question."

    client = InferenceClient(token=settings.HF_TOKEN)

    numbered_contexts = []
    for index, context in enumerate(contexts, start=1):
        numbered_contexts.append(f"Context {index}:\n{context}")
    context_block = "\n\n".join(numbered_contexts)

    prompt = (
        "Answer the question using only the provided context. "
        "If the context is insufficient, say that the answer is not available in the context.\n\n"
        f"{context_block}\n\nQuestion: {question}"
    )
    system_message = {
        "role": "system",
        "content": "You are a careful RAG evaluator that only uses supplied evidence.",
    }
    user_message = {"role": "user", "content": prompt}

    response = client.chat_completion(
        messages=[system_message, user_message],
        model=settings.LLM_MODEL,
        # Answers are capped at 512 tokens even if the setting allows more.
        max_tokens=min(settings.LLM_MAX_NEW_TOKENS, 512),
        temperature=0.0,
    )

    choices = response.choices
    if not choices:
        return ""
    content = choices[0].message.content
    return content.strip() if content else ""
152
+
153
+
154
def collect_records(
    questions: Iterable[EvaluationQuestion],
    user_id: str,
    document_id: Optional[str] = None,
    answer_generator: AnswerGenerator = generate_grounded_answer,
) -> dict[str, list[EvaluationRecord]]:
    """Answer every question once per retrieval mode.

    Args:
        questions: Questions to evaluate.
        user_id: Forwarded to both retrievers.
        document_id: Forwarded to both retrievers.
        answer_generator: Callable that turns ``(question, contexts)`` into an
            answer string.

    Returns:
        A dict with ``"vector"`` and ``"graphrag"`` keys, each mapping to one
        record per question in input order.
    """
    grouped: dict[str, list[EvaluationRecord]] = {"vector": [], "graphrag": []}

    for item in questions:
        scope = {
            "question": item.question,
            "user_id": user_id,
            "document_id": document_id,
        }
        # Both retrievals finish before any answer is generated.
        # NOTE(review): retrieve_graphrag_contexts runs the vector search again
        # internally, so each question is embedded and searched twice.
        contexts_by_mode = {
            "vector": retrieve_vector_contexts(**scope),
            "graphrag": retrieve_graphrag_contexts(**scope),
        }

        for mode, contexts in contexts_by_mode.items():
            grouped[mode].append(
                EvaluationRecord(
                    id=item.id,
                    mode=mode,
                    question=item.question,
                    reference=item.reference,
                    response=answer_generator(item.question, contexts),
                    contexts=contexts,
                )
            )

    return grouped
197
+
198
+
199
def evaluate_records(records: list[EvaluationRecord]) -> dict[str, float]:
    """Score collected records with RAGAS and return the mean of each metric.

    The records become a RAGAS ``EvaluationDataset`` and are scored with
    ``Faithfulness``, ``FactualCorrectness`` and ``LLMContextRecall``. The
    judge is a ``HuggingFaceEndpoint`` built from ``settings.LLM_MODEL`` and
    wrapped in ``LangchainLLMWrapper``.

    Args:
        records: Records for one retrieval mode.

    Returns:
        Metric scores as produced by ``summarize_ragas_result``.
    """
    # Imported inside the function; presumably so the module stays importable
    # when the evaluation dependencies are absent — TODO confirm.
    from langchain_huggingface import HuggingFaceEndpoint
    from ragas import EvaluationDataset, evaluate
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import Faithfulness, FactualCorrectness, LLMContextRecall

    samples = []
    for record in records:
        samples.append(
            {
                "user_input": record.question,
                "retrieved_contexts": record.contexts,
                "response": record.response,
                "reference": record.reference,
            }
        )
    dataset = EvaluationDataset.from_list(samples)

    endpoint = HuggingFaceEndpoint(
        repo_id=settings.LLM_MODEL,
        huggingfacehub_api_token=settings.HF_TOKEN,
        max_new_tokens=512,
        temperature=0.0,
        timeout=300,
    )
    evaluator_llm = LangchainLLMWrapper(endpoint)

    metrics = [Faithfulness(), FactualCorrectness(), LLMContextRecall()]
    result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
    return summarize_ragas_result(result)
236
+
237
+
238
def compare_pipelines(grouped_records: dict[str, list[EvaluationRecord]]) -> dict[str, Any]:
    """Score both retrieval modes and report the per-metric difference.

    Args:
        grouped_records: Records keyed by ``"vector"`` and ``"graphrag"``.

    Returns:
        A dict with the ``"vector"`` scores, the ``"graphrag"`` scores and a
        ``"delta"`` dict of GraphRAG minus vector, rounded to four decimals.
    """
    vector_scores = evaluate_records(grouped_records["vector"])
    graphrag_scores = evaluate_records(grouped_records["graphrag"])

    delta: dict[str, float] = {}
    for metric in sorted(vector_scores.keys() | graphrag_scores.keys()):
        # NOTE(review): a metric missing on one side is counted as 0.0, so its
        # delta equals the other side's raw score.
        difference = graphrag_scores.get(metric, 0.0) - vector_scores.get(metric, 0.0)
        delta[metric] = round(difference, 4)

    return {"vector": vector_scores, "graphrag": graphrag_scores, "delta": delta}
252
+
253
+
254
def _is_usable_score(value: Any) -> bool:
    """Return True for int/float values that are not NaN."""
    # NaN is the only float that compares unequal to itself.
    return isinstance(value, (int, float)) and value == value


def summarize_ragas_result(result: Any) -> dict[str, float]:
    """Normalize RAGAS result objects into mean metric scores.

    Three result shapes are accepted, checked in this order:

    1. An object with ``to_pandas()``: every column holding at least one
       usable number is averaged; other columns are dropped.
    2. A plain dict of already-aggregated scores.
    3. An object whose ``scores`` attribute is a list of per-sample dicts.

    Values that are NaN or not int/float are ignored in all three shapes, so
    a single unscored sample cannot turn a metric's mean into NaN.

    Args:
        result: Value returned by a RAGAS evaluation.

    Returns:
        Metric name mapped to its mean score, rounded to four decimals.

    Raises:
        TypeError: If ``result`` matches none of the supported shapes.
    """
    if hasattr(result, "to_pandas"):
        dataframe = result.to_pandas()
        column_means: dict[str, float] = {}
        for column in dataframe.columns:
            values = [
                float(value)
                for value in dataframe[column].tolist()
                if _is_usable_score(value)
            ]
            if values:
                column_means[str(column)] = round(mean(values), 4)
        return column_means

    if isinstance(result, dict):
        return {
            str(key): round(float(value), 4)
            for key, value in result.items()
            if _is_usable_score(value)
        }

    rows = getattr(result, "scores", None)
    if isinstance(rows, list):
        by_metric: dict[str, list[float]] = {}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for key, value in row.items():
                if _is_usable_score(value):
                    by_metric.setdefault(str(key), []).append(float(value))
        return {key: round(mean(values), 4) for key, values in by_metric.items()}

    raise TypeError(f"Unsupported RAGAS result type: {type(result)!r}")
288
+
289
+
290
+ def _chunk_texts(chunks: list[dict[str, Any]]) -> list[str]:
291
+ return [str(chunk["text"]) for chunk in chunks if chunk.get("text")]
292
+
backend/evaluation/ragas_sample_questions.jsonl ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id":"q001","question":"What is the main purpose of PDF-Assistant-RAG?","reference":"PDF-Assistant-RAG helps users upload documents, retrieve relevant document context, and ask questions answered through a retrieval-augmented generation workflow."}
2
+ {"id":"q002","question":"Which backend framework serves the API?","reference":"The backend API is served by FastAPI."}
3
+ {"id":"q003","question":"Which frontend framework is used for the application interface?","reference":"The frontend is a Next.js application."}
4
+ {"id":"q004","question":"What does the document upload route do before saving permanent state?","reference":"The upload route validates filename, extension, size, MIME type, and parser readability before moving a file into permanent storage."}
5
+ {"id":"q005","question":"Which vector database stores retrieved document chunks?","reference":"ChromaDB stores document chunks for vector retrieval."}
6
+ {"id":"q006","question":"Which embedding model is configured by default?","reference":"The default embedding model is sentence-transformers/all-MiniLM-L6-v2."}
7
+ {"id":"q007","question":"What is the default embedding dimension?","reference":"The default embedding dimension is 384."}
8
+ {"id":"q008","question":"What is the purpose of TOP_K_RETRIEVAL?","reference":"TOP_K_RETRIEVAL controls how many candidate chunks are retrieved before reranking."}
9
+ {"id":"q009","question":"What is the purpose of TOP_K_RERANK?","reference":"TOP_K_RERANK controls how many reranked chunks are finally passed to answer generation."}
10
+ {"id":"q010","question":"Which model family is used for reranking by default?","reference":"The default reranker is a cross-encoder model, cross-encoder/ms-marco-MiniLM-L-6-v2."}
11
+ {"id":"q011","question":"How does the backend identify authenticated users?","reference":"Authenticated routes use JWT identity through the current-user dependency."}
12
+ {"id":"q012","question":"What data must user-facing routes filter by?","reference":"User-facing routes must filter documents, files, vector chunks, and chat data by the authenticated user's id."}
13
+ {"id":"q013","question":"What does the health endpoint check?","reference":"The health endpoint checks service health such as API, SQL database, and Chroma availability."}
14
+ {"id":"q014","question":"What does the chat route provide besides normal JSON answers?","reference":"The chat route supports server-sent events so answers can stream tokens to the frontend."}
15
+ {"id":"q015","question":"What is GraphRAG used for in this project?","reference":"GraphRAG builds and retrieves lightweight entity co-occurrence relationships to add graph context to document answers."}
16
+ {"id":"q016","question":"Where are GraphRAG graph files persisted by default?","reference":"GraphRAG graph files are persisted under the configured GRAPH_PERSIST_DIR, which defaults to ./data/graphs."}
17
+ {"id":"q017","question":"Which graph library is used to store knowledge graph relationships?","reference":"NetworkX is used to build and store knowledge graph relationships."}
18
+ {"id":"q018","question":"What does the graph retriever return for a relevant query?","reference":"The graph retriever returns compact relationship lines connecting matched entities and nearby entities, including page information and relationship strength."}
19
+ {"id":"q019","question":"What happens when GraphRAG finds no matching relationship context?","reference":"When no graph relationships match, the graph retriever returns an empty string."}
20
+ {"id":"q020","question":"Which uploaded file formats are allowed by default?","reference":"The default allowed upload extensions are pdf, docx, txt, and md."}
21
+ {"id":"q021","question":"What is the default upload directory?","reference":"The default upload directory is ./data/uploads."}
22
+ {"id":"q022","question":"Why does the app store original files after upload?","reference":"Original files are stored so the backend can serve files, reprocess them, and extract text for retrieval."}
23
+ {"id":"q023","question":"What is the role of the chunker?","reference":"The chunker extracts document text and splits it into smaller chunks for embedding and retrieval."}
24
+ {"id":"q024","question":"What does the vectorstore service do?","reference":"The vectorstore stores embedded chunks and queries them by user and optional document metadata."}
25
+ {"id":"q025","question":"What does the retriever combine before reranking?","reference":"The retriever combines vector search and BM25 candidates before reranking them."}
26
+ {"id":"q026","question":"Why does the retriever transform queries?","reference":"The retriever rewrites a user question into retrieval-friendly variants to improve search coverage."}
27
+ {"id":"q027","question":"What does the PDF search tool save after retrieving chunks?","reference":"The PDF search tool saves retrieved chunks as last_sources so the agent response can return citations."}
28
+ {"id":"q028","question":"How does the PDF search tool treat document excerpts?","reference":"The PDF search tool labels document excerpts as untrusted evidence and warns the model not to follow instructions inside them."}
29
+ {"id":"q029","question":"What additional context can the PDF search tool append?","reference":"The PDF search tool can append untrusted graph context containing additional relationships from GraphRAG."}
30
+ {"id":"q030","question":"Which optional tool can handle arithmetic questions?","reference":"The calculator tool handles arithmetic expressions safely."}
31
+ {"id":"q031","question":"Which optional tool can handle live information outside uploaded documents?","reference":"The web search tool can look up live web information when document context is insufficient or outdated."}
32
+ {"id":"q032","question":"What does the agent use LangChain tools for?","reference":"The agent uses LangChain tools to route between PDF search, calculator, and web search capabilities."}
33
+ {"id":"q033","question":"What happens when the agent output parser rejects malformed output?","reference":"The app logs the parser rejection and returns a safe malformed-output message."}
34
+ {"id":"q034","question":"What type of API response is used for uploaded document processing status?","reference":"A document status response includes the document id, status, page count, chunk count, and error message."}
35
+ {"id":"q035","question":"How are deleted documents hidden from normal document APIs?","reference":"Documents are soft-deleted with an is_deleted flag and normal APIs filter them out."}
36
+ {"id":"q036","question":"What does deleting a document preserve for future restore flows?","reference":"Soft deletion preserves underlying files, vectors, graphs, and chat history for possible future restore flows."}
37
+ {"id":"q037","question":"What is the purpose of CHUNK_SIZE?","reference":"CHUNK_SIZE controls the number of characters in each document chunk."}
38
+ {"id":"q038","question":"What is the purpose of CHUNK_OVERLAP?","reference":"CHUNK_OVERLAP controls how much text overlaps between adjacent chunks to preserve boundary context."}
39
+ {"id":"q039","question":"Which HuggingFace setting controls answer length?","reference":"LLM_MAX_NEW_TOKENS controls the maximum number of generated tokens for answers."}
40
+ {"id":"q040","question":"Which HuggingFace setting controls answer randomness?","reference":"LLM_TEMPERATURE controls sampling randomness during answer generation."}
41
+ {"id":"q041","question":"What environment variable stores the HuggingFace token?","reference":"HF_TOKEN stores the HuggingFace API token used for inference."}
42
+ {"id":"q042","question":"Why should DEBUG not be enabled in production?","reference":"DEBUG enables detailed behavior intended for development and should not be enabled in production."}
43
+ {"id":"q043","question":"How are production CORS origins configured?","reference":"Production CORS origins are configured through ALLOWED_ORIGINS."}
44
+ {"id":"q044","question":"What database is used by default for local development?","reference":"The default database URL points to a local SQLite database at ./data/app.db."}
45
+ {"id":"q045","question":"What database does Docker Compose provide for the stack?","reference":"Docker Compose provides a PostgreSQL database service for the stack."}
46
+ {"id":"q046","question":"What is the contributor target branch for pull requests?","reference":"Contributor pull requests should target the dev branch."}
47
+ {"id":"q047","question":"Which branch is production protected for deployment?","reference":"The main branch is treated as the production branch for deployment."}
48
+ {"id":"q048","question":"Where can developers view Swagger locally?","reference":"Developers can view Swagger at /docs when the backend is running locally."}
49
+ {"id":"q049","question":"What does the architecture document focus on?","reference":"The architecture document focuses on how requests move through the system and how major runtime components interact."}
50
+ {"id":"q050","question":"Why is a RAGAS evaluation pipeline useful for this project?","reference":"A RAGAS evaluation pipeline provides quantitative scores to compare standard vector search with GraphRAG and track retrieval and answer quality over time."}
backend/requirements.txt CHANGED
@@ -38,6 +38,7 @@ langchain-huggingface
38
  langchain-text-splitters
39
  langsmith
40
  rank-bm25
 
41
 
42
  # Embeddings & ML
43
  sentence-transformers
 
38
  langchain-text-splitters
39
  langsmith
40
  rank-bm25
41
+ ragas>=0.3.0
42
 
43
  # Embeddings & ML
44
  sentence-transformers
backend/scripts/run_ragas_eval.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run a 50-question RAGAS comparison for vector search and GraphRAG."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+
10
# Two levels above this file's directory.
ROOT = Path(__file__).resolve().parents[2]
BACKEND_DIR = ROOT / "backend"

# Put the backend directory on sys.path so `app.*` imports resolve when the
# script is run directly; skipped when the entry is already present.
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))

# Default locations of the question set and of the written results.
DEFAULT_DATASET = BACKEND_DIR / "evaluation" / "ragas_sample_questions.jsonl"
DEFAULT_OUTPUT = BACKEND_DIR / "evaluation" / "ragas_results.json"
17
+
18
+
19
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    Returns:
        Namespace with ``user_id`` (required), ``document_id``, ``dataset``,
        ``output`` and ``limit``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate vector search versus GraphRAG with RAGAS.",
    )
    # Each entry is (flag, keyword arguments for add_argument), in help order.
    option_specs = (
        ("--user-id", {"required": True, "help": "Owner user id for indexed documents."}),
        ("--document-id", {"help": "Optional single document id to evaluate."}),
        ("--dataset", {"type": Path, "default": DEFAULT_DATASET}),
        ("--output", {"type": Path, "default": DEFAULT_OUTPUT}),
        ("--limit", {"type": int, "default": 50}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
29
+
30
+
31
def main() -> None:
    """Run the evaluation, write the JSON report and print the scores."""
    args = parse_args()

    # Imported only after argument parsing has succeeded.
    from app.evaluation.ragas_pipeline import collect_records, compare_pipelines, load_questions

    questions = load_questions(args.dataset, limit=args.limit)
    scores = compare_pipelines(
        collect_records(
            questions=questions,
            user_id=args.user_id,
            document_id=args.document_id,
        )
    )

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "dataset": str(args.dataset),
        "question_count": len(questions),
        "user_id": args.user_id,
        "document_id": args.document_id,
        "scores": scores,
    }
    report = json.dumps(payload, indent=2)

    # Create the output directory when it does not exist yet.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(report, encoding="utf-8")

    print(json.dumps(scores, indent=2))
    print(f"Wrote RAGAS evaluation results to {args.output}")
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
backend/tests/test_ragas_pipeline.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from types import SimpleNamespace
3
+
4
+ from app.evaluation import ragas_pipeline
5
+ from app.evaluation.ragas_pipeline import (
6
+ EvaluationQuestion,
7
+ append_graph_context,
8
+ collect_records,
9
+ load_questions,
10
+ summarize_ragas_result,
11
+ )
12
+
13
+
14
def test_load_questions_requires_exact_limit(tmp_path):
    """load_questions returns exactly ``limit`` rows and rejects shorter files."""
    dataset = tmp_path / "questions.jsonl"
    rows = [
        {"id": "q1", "question": "Question 1?", "reference": "Reference 1."},
        {"id": "q2", "question": "Question 2?", "reference": "Reference 2."},
    ]
    dataset.write_text("\n".join(json.dumps(row) for row in rows), encoding="utf-8")

    questions = load_questions(dataset, limit=2)

    assert [question.id for question in questions] == ["q1", "q2"]
    assert questions[0].question == "Question 1?"

    # The failure path named in the test title: asking for more questions
    # than the file holds must raise instead of returning a short list.
    try:
        load_questions(dataset, limit=3)
    except ValueError as exc:
        assert "Expected 3 evaluation questions, found 2" in str(exc)
    else:
        raise AssertionError("load_questions accepted a dataset shorter than limit")
26
+
27
+
28
def test_append_graph_context_skips_empty_context():
    """Whitespace-only graph context is ignored; real context goes last."""
    without_graph = append_graph_context(["vector context"], " ")
    with_graph = append_graph_context(["vector context"], "graph context")

    assert without_graph == ["vector context"]
    assert with_graph == ["vector context", "graph context"]
34
+
35
+
36
def test_collect_records_builds_vector_and_graphrag_samples(monkeypatch):
    """collect_records produces one record per mode from the stubbed retrievers."""

    def fake_vector_contexts(**_kwargs):
        return ["Alpha vector context."]

    def fake_graphrag_contexts(**_kwargs):
        return ["Alpha vector context.", "Alpha is related to Beta."]

    def fake_answer(question, contexts):
        return f"{question} -> {len(contexts)} contexts"

    # Replace both retrievers so no embedding model or vector store is needed.
    monkeypatch.setattr(ragas_pipeline, "retrieve_vector_contexts", fake_vector_contexts)
    monkeypatch.setattr(ragas_pipeline, "retrieve_graphrag_contexts", fake_graphrag_contexts)

    question = EvaluationQuestion(
        id="q1",
        question="What is Alpha?",
        reference="Alpha is a product.",
    )
    records = collect_records(
        questions=[question],
        user_id="user-1",
        answer_generator=fake_answer,
    )

    vector_record = records["vector"][0]
    graphrag_record = records["graphrag"][0]
    assert vector_record.mode == "vector"
    assert vector_record.response.endswith("1 contexts")
    assert graphrag_record.mode == "graphrag"
    assert graphrag_record.response.endswith("2 contexts")
62
+
63
+
64
def test_summarize_ragas_result_averages_score_rows():
    """Per-sample score rows are averaged metric by metric."""
    score_rows = [
        {"faithfulness": 1.0, "context_recall": 0.5},
        {"faithfulness": 0.5, "context_recall": 1.0},
    ]
    # SimpleNamespace has no to_pandas(), so the `scores` list branch is used.
    result = SimpleNamespace(scores=score_rows)

    summary = summarize_ragas_result(result)

    expected = {"faithfulness": 0.75, "context_recall": 0.75}
    assert summary == expected
76
+