Spaces:

aankitdas
/

doc-intelligence-rag

Sleeping

App Files Files Community

aankitdas commited on Jan 7

Commit

af2f8e1

1 Parent(s): 714bdb1

feat: add RAGAS eval

Browse files

Files changed (15) hide show

evaluation_results/_results.jsonl +50 -0
evaluation_results/results.jsonl +20 -50
frontend/evaluation.html +567 -341
frontend/index.html +1 -0
frontend/index_.html +592 -0
frontend/ragas.html +671 -0
pyproject.toml +5 -2
run_real_evaluation.py +473 -0
src/evaluation/ragas_integration/__init__.py +14 -0
src/evaluation/ragas_integration/pipeline_adapter.py +103 -0
src/evaluation/ragas_integration/ragas_endpoints.py +129 -0
src/evaluation/ragas_integration/ragas_evaluator.py +139 -0
src/evaluation/ragas_integration/test_ragas.py +68 -0
src/main.py +35 -5
uv.lock +0 -0

evaluation_results/_results.jsonl ADDED Viewed

	@@ -0,0 +1,50 @@

+{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8615102352119911, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.6217199504672873, "bert_score": 0.9101784656133992, "answer_relevance": 0.8611807441816679, "faithfulness": 0.9889532712914122, "hallucination_detected": 0, "source_attribution_score": 0.9197433053801606, "latency_ms": 193.9050181207473, "tokens_used": 180, "cost_cents": 0.1947866279399885, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6691151867351297, "retrieval_recall": 0.823127264267807, "rank_position": 1, "rouge_l": 0.714583633420124, "bert_score": 0.7968070501948343, "answer_relevance": 0.8386952468169229, "faithfulness": 0.8427198816502497, "hallucination_detected": 0, "source_attribution_score": 0.834049480985246, "latency_ms": 309.52617615332184, "tokens_used": 130, "cost_cents": 0.5222450372503339, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
+{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 2, "retrieval_precision": 0.7820863657323606, "retrieval_recall": 0.7278826391993161, "rank_position": 4, "rouge_l": 0.7288516571075816, "bert_score": 0.800838399605806, "answer_relevance": 0.7623839343155656, "faithfulness": 0.760938424869514, "hallucination_detected": 0, "source_attribution_score": 0.7367638541396095, "latency_ms": 127.88553000716428, "tokens_used": 86, "cost_cents": 0.6028654205830427, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8032957101002208, "retrieval_recall": 0.9059703284838815, "rank_position": 1, "rouge_l": 0.8300273338544246, "bert_score": 0.9454453940286349, "answer_relevance": 0.9520338304764728, "faithfulness": 0.897131533318752, "hallucination_detected": 0, "source_attribution_score": 0.9492810947177941, "latency_ms": 465.42015740446305, "tokens_used": 223, "cost_cents": 0.20152073810222879, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8530337397480929, "retrieval_recall": 0.7059261296867919, "rank_position": 2, "rouge_l": 0.7193205808960748, "bert_score": 0.9022171118953591, "answer_relevance": 0.8531732924021801, "faithfulness": 0.8121930123501006, "hallucination_detected": 0, "source_attribution_score": 0.8249719199625603, "latency_ms": 118.93191807619638, "tokens_used": 156, "cost_cents": 0.6705483559336415, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.871851781632808, "retrieval_recall": 1.0, "rank_position": 2, "rouge_l": 0.8763052973676115, "bert_score": 0.9155189067363468, "answer_relevance": 0.7819811920531572, "faithfulness": 0.9020511875557776, "hallucination_detected": 0, "source_attribution_score": 0.8835911263653357, "latency_ms": 215.06062627830062, "tokens_used": 158, "cost_cents": 0.28454809454724767, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8374406518052472, "retrieval_recall": 0.8995269271491464, "rank_position": 1, "rouge_l": 0.6625752862799461, "bert_score": 0.8433178449037969, "answer_relevance": 0.8430013035861083, "faithfulness": 0.893951241843859, "hallucination_detected": 0, "source_attribution_score": 0.7615935243739598, "latency_ms": 419.38297913278507, "tokens_used": 219, "cost_cents": 0.685936998794628, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7801096274968522, "retrieval_recall": 0.6863130363664056, "rank_position": 2, "rouge_l": 0.7314517596590595, "bert_score": 0.8276297752821052, "answer_relevance": 0.7915041989155733, "faithfulness": 0.814200114298667, "hallucination_detected": 0, "source_attribution_score": 0.7910397701255416, "latency_ms": 192.75282528673864, "tokens_used": 140, "cost_cents": 0.5706402044081957, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
+{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9319798193959905, "retrieval_recall": 0.7301414759104026, "rank_position": 3, "rouge_l": 0.9195189478153559, "bert_score": 0.9506571721308754, "answer_relevance": 0.9012898093375585, "faithfulness": 0.9159276711160365, "hallucination_detected": 0, "source_attribution_score": 0.8105097496319957, "latency_ms": 310.01153330005803, "tokens_used": 134, "cost_cents": 0.36313962364633723, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.987131336980288, "retrieval_recall": 0.8161833189974133, "rank_position": 2, "rouge_l": 0.754633239450571, "bert_score": 0.8525460742457374, "answer_relevance": 0.8388153285264023, "faithfulness": 0.8947958687708046, "hallucination_detected": 0, "source_attribution_score": 0.8670965141635586, "latency_ms": 367.03119966417205, "tokens_used": 105, "cost_cents": 0.6425165690009661, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8161943972603446, "retrieval_recall": 0.8191451209916161, "rank_position": 1, "rouge_l": 0.8566238483374247, "bert_score": 0.8407886193759627, "answer_relevance": 0.788647130938179, "faithfulness": 0.9458751488959517, "hallucination_detected": 0, "source_attribution_score": 0.8442883639082127, "latency_ms": 394.39735015927437, "tokens_used": 126, "cost_cents": 0.6182353694114775, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "730981e3"}
+{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9268720168249583, "retrieval_recall": 0.7744657390458949, "rank_position": 1, "rouge_l": 0.726991263638828, "bert_score": 0.9072089249292097, "answer_relevance": 0.7368736773342853, "faithfulness": 0.9109545928726132, "hallucination_detected": 0, "source_attribution_score": 0.8389074559482628, "latency_ms": 363.97033617468753, "tokens_used": 89, "cost_cents": 0.1520075706493582, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 1.0, "retrieval_recall": 0.7883493024047399, "rank_position": 2, "rouge_l": 0.8794507996771228, "bert_score": 0.9890372805052198, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9202874935555082, "latency_ms": 180.5318450150473, "tokens_used": 164, "cost_cents": 0.3633483811341406, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
+{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9574712547642229, "retrieval_recall": 0.898715076798533, "rank_position": 3, "rouge_l": 0.8192000079755279, "bert_score": 0.8864239733582311, "answer_relevance": 0.7428977779588922, "faithfulness": 0.9030187960492433, "hallucination_detected": 0, "source_attribution_score": 0.7624554954695243, "latency_ms": 373.14060250844705, "tokens_used": 84, "cost_cents": 0.7117749597236492, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
+{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9405730690612055, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.8808942707215686, "bert_score": 0.9132934029079159, "answer_relevance": 0.905135078735406, "faithfulness": 0.8931492108116512, "hallucination_detected": 0, "source_attribution_score": 0.8268512614166635, "latency_ms": 339.0772795799579, "tokens_used": 214, "cost_cents": 0.7190302687955942, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9730519547186312, "retrieval_recall": 0.7000247515495168, "rank_position": 1, "rouge_l": 0.8481502060571952, "bert_score": 0.8512169150469242, "answer_relevance": 0.6066385743234217, "faithfulness": 0.90739914345254, "hallucination_detected": 0, "source_attribution_score": 0.7272214400773345, "latency_ms": 180.27676298939465, "tokens_used": 213, "cost_cents": 0.32992956367012927, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.882795855159822, "retrieval_recall": 0.9536984414043154, "rank_position": 2, "rouge_l": 0.8652639536487609, "bert_score": 0.9510206928805952, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9041183043586343, "latency_ms": 304.8777755850387, "tokens_used": 202, "cost_cents": 0.4573272605920282, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ea8a82db"}
+{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9074913930124041, "retrieval_recall": 0.8337368592917234, "rank_position": 2, "rouge_l": 0.7866443218610295, "bert_score": 0.8890794098843255, "answer_relevance": 0.8644104258787939, "faithfulness": 0.9162807414858771, "hallucination_detected": 0, "source_attribution_score": 0.8535049903935279, "latency_ms": 153.72881316060213, "tokens_used": 192, "cost_cents": 0.42866080997615663, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "f93850ae"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.736362372230679, "retrieval_recall": 0.6501144391355456, "rank_position": 1, "rouge_l": 0.550190206571193, "bert_score": 0.7928686128619693, "answer_relevance": 0.7479453349256642, "faithfulness": 0.724801089955014, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 318.4624295326313, "tokens_used": 190, "cost_cents": 0.2437715682365154, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6735724740087383, "retrieval_recall": 0.8542061489968834, "rank_position": 3, "rouge_l": 0.753272128099022, "bert_score": 0.7951286425632846, "answer_relevance": 0.7111372864814403, "faithfulness": 0.7658078142862852, "hallucination_detected": 0, "source_attribution_score": 0.7921982358187583, "latency_ms": 74.69663423689695, "tokens_used": 81, "cost_cents": 0.27307125512490826, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "d72f240c"}
+{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
+{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
+{"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
+{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
+{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
+{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
+{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
+{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
+{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
+{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
+{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
+{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}

evaluation_results/results.jsonl CHANGED Viewed

@@ -1,50 +1,20 @@
-{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8615102352119911, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.6217199504672873, "bert_score": 0.9101784656133992, "answer_relevance": 0.8611807441816679, "faithfulness": 0.9889532712914122, "hallucination_detected": 0, "source_attribution_score": 0.9197433053801606, "latency_ms": 193.9050181207473, "tokens_used": 180, "cost_cents": 0.1947866279399885, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6691151867351297, "retrieval_recall": 0.823127264267807, "rank_position": 1, "rouge_l": 0.714583633420124, "bert_score": 0.7968070501948343, "answer_relevance": 0.8386952468169229, "faithfulness": 0.8427198816502497, "hallucination_detected": 0, "source_attribution_score": 0.834049480985246, "latency_ms": 309.52617615332184, "tokens_used": 130, "cost_cents": 0.5222450372503339, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
-{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 2, "retrieval_precision": 0.7820863657323606, "retrieval_recall": 0.7278826391993161, "rank_position": 4, "rouge_l": 0.7288516571075816, "bert_score": 0.800838399605806, "answer_relevance": 0.7623839343155656, "faithfulness": 0.760938424869514, "hallucination_detected": 0, "source_attribution_score": 0.7367638541396095, "latency_ms": 127.88553000716428, "tokens_used": 86, "cost_cents": 0.6028654205830427, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8032957101002208, "retrieval_recall": 0.9059703284838815, "rank_position": 1, "rouge_l": 0.8300273338544246, "bert_score": 0.9454453940286349, "answer_relevance": 0.9520338304764728, "faithfulness": 0.897131533318752, "hallucination_detected": 0, "source_attribution_score": 0.9492810947177941, "latency_ms": 465.42015740446305, "tokens_used": 223, "cost_cents": 0.20152073810222879, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8530337397480929, "retrieval_recall": 0.7059261296867919, "rank_position": 2, "rouge_l": 0.7193205808960748, "bert_score": 0.9022171118953591, "answer_relevance": 0.8531732924021801, "faithfulness": 0.8121930123501006, "hallucination_detected": 0, "source_attribution_score": 0.8249719199625603, "latency_ms": 118.93191807619638, "tokens_used": 156, "cost_cents": 0.6705483559336415, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.871851781632808, "retrieval_recall": 1.0, "rank_position": 2, "rouge_l": 0.8763052973676115, "bert_score": 0.9155189067363468, "answer_relevance": 0.7819811920531572, "faithfulness": 0.9020511875557776, "hallucination_detected": 0, "source_attribution_score": 0.8835911263653357, "latency_ms": 215.06062627830062, "tokens_used": 158, "cost_cents": 0.28454809454724767, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8374406518052472, "retrieval_recall": 0.8995269271491464, "rank_position": 1, "rouge_l": 0.6625752862799461, "bert_score": 0.8433178449037969, "answer_relevance": 0.8430013035861083, "faithfulness": 0.893951241843859, "hallucination_detected": 0, "source_attribution_score": 0.7615935243739598, "latency_ms": 419.38297913278507, "tokens_used": 219, "cost_cents": 0.685936998794628, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7801096274968522, "retrieval_recall": 0.6863130363664056, "rank_position": 2, "rouge_l": 0.7314517596590595, "bert_score": 0.8276297752821052, "answer_relevance": 0.7915041989155733, "faithfulness": 0.814200114298667, "hallucination_detected": 0, "source_attribution_score": 0.7910397701255416, "latency_ms": 192.75282528673864, "tokens_used": 140, "cost_cents": 0.5706402044081957, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
-{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9319798193959905, "retrieval_recall": 0.7301414759104026, "rank_position": 3, "rouge_l": 0.9195189478153559, "bert_score": 0.9506571721308754, "answer_relevance": 0.9012898093375585, "faithfulness": 0.9159276711160365, "hallucination_detected": 0, "source_attribution_score": 0.8105097496319957, "latency_ms": 310.01153330005803, "tokens_used": 134, "cost_cents": 0.36313962364633723, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.987131336980288, "retrieval_recall": 0.8161833189974133, "rank_position": 2, "rouge_l": 0.754633239450571, "bert_score": 0.8525460742457374, "answer_relevance": 0.8388153285264023, "faithfulness": 0.8947958687708046, "hallucination_detected": 0, "source_attribution_score": 0.8670965141635586, "latency_ms": 367.03119966417205, "tokens_used": 105, "cost_cents": 0.6425165690009661, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8161943972603446, "retrieval_recall": 0.8191451209916161, "rank_position": 1, "rouge_l": 0.8566238483374247, "bert_score": 0.8407886193759627, "answer_relevance": 0.788647130938179, "faithfulness": 0.9458751488959517, "hallucination_detected": 0, "source_attribution_score": 0.8442883639082127, "latency_ms": 394.39735015927437, "tokens_used": 126, "cost_cents": 0.6182353694114775, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "730981e3"}
-{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9268720168249583, "retrieval_recall": 0.7744657390458949, "rank_position": 1, "rouge_l": 0.726991263638828, "bert_score": 0.9072089249292097, "answer_relevance": 0.7368736773342853, "faithfulness": 0.9109545928726132, "hallucination_detected": 0, "source_attribution_score": 0.8389074559482628, "latency_ms": 363.97033617468753, "tokens_used": 89, "cost_cents": 0.1520075706493582, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 1.0, "retrieval_recall": 0.7883493024047399, "rank_position": 2, "rouge_l": 0.8794507996771228, "bert_score": 0.9890372805052198, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9202874935555082, "latency_ms": 180.5318450150473, "tokens_used": 164, "cost_cents": 0.3633483811341406, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
-{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9574712547642229, "retrieval_recall": 0.898715076798533, "rank_position": 3, "rouge_l": 0.8192000079755279, "bert_score": 0.8864239733582311, "answer_relevance": 0.7428977779588922, "faithfulness": 0.9030187960492433, "hallucination_detected": 0, "source_attribution_score": 0.7624554954695243, "latency_ms": 373.14060250844705, "tokens_used": 84, "cost_cents": 0.7117749597236492, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
-{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9405730690612055, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.8808942707215686, "bert_score": 0.9132934029079159, "answer_relevance": 0.905135078735406, "faithfulness": 0.8931492108116512, "hallucination_detected": 0, "source_attribution_score": 0.8268512614166635, "latency_ms": 339.0772795799579, "tokens_used": 214, "cost_cents": 0.7190302687955942, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9730519547186312, "retrieval_recall": 0.7000247515495168, "rank_position": 1, "rouge_l": 0.8481502060571952, "bert_score": 0.8512169150469242, "answer_relevance": 0.6066385743234217, "faithfulness": 0.90739914345254, "hallucination_detected": 0, "source_attribution_score": 0.7272214400773345, "latency_ms": 180.27676298939465, "tokens_used": 213, "cost_cents": 0.32992956367012927, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.882795855159822, "retrieval_recall": 0.9536984414043154, "rank_position": 2, "rouge_l": 0.8652639536487609, "bert_score": 0.9510206928805952, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9041183043586343, "latency_ms": 304.8777755850387, "tokens_used": 202, "cost_cents": 0.4573272605920282, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ea8a82db"}
-{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9074913930124041, "retrieval_recall": 0.8337368592917234, "rank_position": 2, "rouge_l": 0.7866443218610295, "bert_score": 0.8890794098843255, "answer_relevance": 0.8644104258787939, "faithfulness": 0.9162807414858771, "hallucination_detected": 0, "source_attribution_score": 0.8535049903935279, "latency_ms": 153.72881316060213, "tokens_used": 192, "cost_cents": 0.42866080997615663, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "f93850ae"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.736362372230679, "retrieval_recall": 0.6501144391355456, "rank_position": 1, "rouge_l": 0.550190206571193, "bert_score": 0.7928686128619693, "answer_relevance": 0.7479453349256642, "faithfulness": 0.724801089955014, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 318.4624295326313, "tokens_used": 190, "cost_cents": 0.2437715682365154, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6735724740087383, "retrieval_recall": 0.8542061489968834, "rank_position": 3, "rouge_l": 0.753272128099022, "bert_score": 0.7951286425632846, "answer_relevance": 0.7111372864814403, "faithfulness": 0.7658078142862852, "hallucination_detected": 0, "source_attribution_score": 0.7921982358187583, "latency_ms": 74.69663423689695, "tokens_used": 81, "cost_cents": 0.27307125512490826, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "d72f240c"}
-{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
-{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
-{"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
-{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
-{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
-{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
-{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
-{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
-{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
-{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
-{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
-{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}

+{"query": "What is the response rate of Drug X?", "answer": "According to the provided context (Chunk 1), Drug X demonstrated a 65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 324.74422454833984, "tokens_used": 21, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:34.998293", "eval_id": "4e10192a"}
+{"query": "What are the side effects of Drug X?", "answer": "The side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.8542079925537, "tokens_used": 42, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.198696", "eval_id": "05359951"}
+{"query": "How does Drug X interact with warfarin?", "answer": "There is information about the interaction between Drug X and warfarin in the context provided. According to the information in Chunk 1, the interaction between Drug X and warfarin is:\n\n- Potential increased bleeding risk\n- Monitor INR closely\n- Baseline INR and weekly monitoring recommended", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 274.7206687927246, "tokens_used": 46, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.474923", "eval_id": "09eb8892"}
+{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 233.20865631103516, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.708131", "eval_id": "297130d7"}
+{"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 150.78186988830566, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.860418", "eval_id": "f55b494b"}
+{"query": "What is the response rate of Drug X?", "answer": "65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 435.7447624206543, "tokens_used": 10, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.660827", "eval_id": "92c25556"}
+{"query": "What are the side effects of Drug X?", "answer": "The most common adverse events reported in the clinical trial for Drug X were:\n\n1. Headache (12% of patients)\n2. Mild gastrointestinal upset (8% of patients)\n3. Dizziness (5% of patients)\n4. Fatigue (4% of patients)\n\nAdditionally, in the case study, the patient experienced:\n\n1. Mild headache (resolved with acetaminophen)\n2. Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 236.0692024230957, "tokens_used": 57, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.898918", "eval_id": "1fab9b0a"}
+{"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context (Chunk 1 - Relevance: 66.9%), Drug X has a potential interaction with warfarin: \"2. Warfarin: - Potential increased bleeding risk - Monitor INR closely - Baseline INR and weekly monitoring recommended\"", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.96793365478516, "tokens_used": 36, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.098393", "eval_id": "83a684fc"}
+{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 149.9345302581787, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.248327", "eval_id": "a0771f33"}
+{"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 179.38470840454102, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.440212", "eval_id": "35939350"}
+{"query": "What is the response rate of Drug X?", "answer": "The response rate of Drug X is 65%, compared to 35% in the placebo group (p<0.001).", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5333333333333333, "hallucination_detected": true, "source_attribution_score": 0.3125, "latency_ms": 336.91954612731934, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.623166", "eval_id": "3688751b"}
+{"query": "What are the side effects of Drug X?", "answer": "The most common side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.18181818181818182, "hallucination_detected": true, "source_attribution_score": 0.06818181818181818, "latency_ms": 331.4476013183594, "tokens_used": 44, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.955619", "eval_id": "586b096d"}
+{"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context, Drug X interacts with warfarin by potentially increasing the risk of bleeding. Monitoring of INR is recommended, including baseline INR and weekly monitoring is recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.48, "hallucination_detected": true, "source_attribution_score": 0.23333333333333334, "latency_ms": 283.388614654541, "tokens_used": 30, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.240178", "eval_id": "a37df1d9"}
+{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals. Dose adjustments may be necessary for patients with renal impairment, in which case a dose reduction to 250mg twice daily is recommended.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.23529411764705882, "latency_ms": 272.6554870605469, "tokens_used": 34, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.512833", "eval_id": "66ecfa00"}
+{"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.22580645161290322, "latency_ms": 165.5900478363037, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.679422", "eval_id": "4b7e99dd"}
+{"query": "What is the response rate of Drug X?", "answer": "According to the provided context in Chunk 1, the response rate of Drug X was 65%.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.375, "latency_ms": 594.7244167327881, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.135089", "eval_id": "f5583f32"}
+{"query": "What are the side effects of Drug X?", "answer": "According to the provided context, the side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, from the case study, the patient experienced:\n\n- Mild headache (resolved with acetaminophen)\n- Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.3888888888888889, "hallucination_detected": true, "source_attribution_score": 0.2, "latency_ms": 265.3524875640869, "tokens_used": 55, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.403565", "eval_id": "07148b4e"}
+{"query": "How does Drug X interact with warfarin?", "answer": "According to the context, Drug X interacts with warfarin as follows: \n\n- There is a potential increased bleeding risk.\n- It is recommended to monitor INR closely.\n- Baseline INR and weekly monitoring are recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.4666666666666667, "hallucination_detected": true, "source_attribution_score": 0.22857142857142856, "latency_ms": 339.0464782714844, "tokens_used": 35, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.742611", "eval_id": "b7c99790"}
+{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dosage of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.3333333333333333, "latency_ms": 176.81241035461426, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.920425", "eval_id": "1993b43f"}
+{"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.46153846153846156, "hallucination_detected": true, "source_attribution_score": 0.2222222222222222, "latency_ms": 389.65582847595215, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:11.310612", "eval_id": "e1fdaf31"}

frontend/evaluation.html CHANGED Viewed

@@ -7,154 +7,280 @@
     <title>RAG Evaluation Dashboard</title>
     <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
     <style>
         * {
             margin: 0;
             padding: 0;
             box-sizing: border-box;
         }
         body {
-            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             min-height: 100vh;
-            padding: 20px;
         }
         .container {
-            max-width: 1400px;
             margin: 0 auto;
-            background: white;
-            border-radius: 15px;
-            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
-            overflow: hidden;
         }
-        .header {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            color: white;
-            padding: 40px 30px;
             text-align: center;
         }
-        .header h1 {
-            font-size: 2.5em;
-            margin-bottom: 10px;
         }
-        .header p {
-            font-size: 1.1em;
-            opacity: 0.9;
         }
         .nav-buttons {
             display: flex;
-            gap: 10px;
             justify-content: center;
-            margin-top: 20px;
             flex-wrap: wrap;
         }
         .nav-btn {
-            padding: 10px 20px;
-            background: rgba(255, 255, 255, 0.2);
-            border: 2px solid white;
             color: white;
-            border-radius: 5px;
             cursor: pointer;
-            font-size: 1em;
-            transition: all 0.3s;
         }
-        .nav-btn:hover,
-        .nav-btn.active {
-            background: white;
-            color: #667eea;
         }
-        .content {
-            padding: 30px;
         }
-        .section {
-            display: none;
         }
-        .section.active {
-            display: block;
-            animation: fadeIn 0.3s;
         }
-        @keyframes fadeIn {
-            from {
-                opacity: 0;
-            }
-            to {
-                opacity: 1;
-            }
         }
         .metrics-grid {
             display: grid;
-            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-            gap: 20px;
-            margin-bottom: 30px;
         }
         .metric-card {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             color: white;
-            padding: 25px;
-            border-radius: 10px;
             text-align: center;
-            box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
-            transition: transform 0.3s;
         }
         .metric-card:hover {
-            transform: translateY(-5px);
         }
         .metric-label {
-            font-size: 0.9em;
             opacity: 0.9;
-            margin-bottom: 10px;
             text-transform: uppercase;
-            letter-spacing: 1px;
         }
         .metric-value {
-            font-size: 2em;
-            font-weight: bold;
         }
         .metric-unit {
-            font-size: 0.7em;
             opacity: 0.8;
-            margin-top: 5px;
         }
         .chart-container {
-            background: white;
-            border: 1px solid #e0e0e0;
-            border-radius: 10px;
-            padding: 20px;
-            margin-bottom: 30px;
-            min-height: 400px;
         }
         .chart-title {
-            font-size: 1.3em;
             font-weight: 600;
-            margin-bottom: 15px;
-            color: #333;
         }
         .two-column {
             display: grid;
             grid-template-columns: 1fr 1fr;
             gap: 20px;
-            margin-bottom: 20px;
         }
         @media (max-width: 900px) {
@@ -163,31 +289,49 @@
             }
         }
-        .status-good {
-            color: #2ecc71;
         }
-        .status-warning {
-            color: #f39c12;
         }
-        .status-critical {
-            color: #e74c3c;
         }
         .info-box {
-            background: #f8f9fa;
-            border-left: 4px solid #667eea;
-            padding: 15px;
             margin-bottom: 20px;
-            border-radius: 5px;
         }
         .info-box p {
-            color: #555;
-            margin: 5px 0;
         }
         .button-group {
             display: flex;
             gap: 10px;
@@ -196,43 +340,49 @@
         }
         .btn {
-            padding: 10px 20px;
-            background: #667eea;
             color: white;
             border: none;
-            border-radius: 5px;
             cursor: pointer;
-            font-size: 1em;
-            transition: background 0.3s;
         }
         .btn:hover {
-            background: #764ba2;
         }
         .btn-secondary {
-            background: #95a5a6;
         }
         .btn-secondary:hover {
-            background: #7f8c8d;
         }
         .loading {
             display: none;
             text-align: center;
-            padding: 20px;
-            color: #667eea;
         }
         .spinner {
-            border: 4px solid #f3f3f3;
-            border-top: 4px solid #667eea;
             border-radius: 50%;
             width: 40px;
             height: 40px;
             animation: spin 1s linear infinite;
-            margin: 0 auto 10px;
         }
         @keyframes spin {
@@ -245,272 +395,287 @@
             }
         }
-        .failure-list {
-            max-height: 400px;
-            overflow-y: auto;
-        }
         .failure-item {
-            background: #f8f9fa;
-            padding: 10px;
-            margin: 5px 0;
-            border-radius: 5px;
-            border-left: 4px solid #e74c3c;
         }
         .failure-item-query {
-            font-weight: 600;
-            color: #333;
         }
         .failure-item-score {
-            font-size: 0.9em;
-            color: #e74c3c;
-            margin-top: 5px;
         }
     </style>
 </head>
 <body>
     <div class="container">
-        <div class="header">
-            <h1>🔍 RAG Evaluation Dashboard</h1>
-            <p>Comprehensive evaluation metrics for your Retrieval-Augmented Generation system</p>
             <div class="nav-buttons">
-                <button class="nav-btn active" onclick="showSection('overview')">Overview</button>
-                <button class="nav-btn" onclick="showSection('retrieval')">Retrieval</button>
-                <button class="nav-btn" onclick="showSection('generation')">Generation</button>
-                <button class="nav-btn" onclick="showSection('faithfulness')">Faithfulness</button>
-                <button class="nav-btn" onclick="showSection('performance')">Performance</button>
-                <button class="nav-btn" onclick="showSection('failures')">Failures</button>
             </div>
         </div>
-        <div class="content">
-            <!-- Overview Section -->
-            <div id="overview" class="section active">
-                <div class="button-group">
-                    <button class="btn" onclick="loadMetrics()">🔄 Refresh Metrics</button>
-                    <button class="btn btn-secondary" onclick="exportResults()">📥 Export Results</button>
-                    <button class="btn btn-secondary" onclick="clearResults()">🗑️ Clear Results</button>
-                </div>
-                <div class="loading" id="loading">
-                    <div class="spinner"></div>
-                    Loading metrics...
                 </div>
-                <div class="metrics-grid" id="metricsGrid">
-                    <!-- Populated by JavaScript -->
                 </div>
-                <div class="info-box">
-                    <p><strong>📊 Total Evaluations:</strong> <span id="totalEvals">0</span></p>
-                    <p><strong>📅 Last Updated:</strong> <span id="lastUpdated">--</span></p>
-                    <p><strong>✅ System Status:</strong> <span id="systemStatus">Initializing...</span></p>
                 </div>
             </div>
-            <!-- Retrieval Section -->
-            <div id="retrieval" class="section">
-                <h2 class="chart-title">📈 Retrieval Quality Analysis</h2>
-                <div class="two-column">
-                    <div class="chart-container">
-                        <div class="chart-title">Precision & Recall Trend</div>
-                        <div id="retrievalChart"></div>
-                    </div>
-                    <div class="chart-container">
-                        <div class="chart-title">Key Metrics</div>
-                        <div style="padding: 20px;">
-                            <p><strong>Mean Reciprocal Rank (MRR):</strong> <span id="mrrValue">--</span></p>
-                            <p style="margin-top: 10px; font-size: 0.9em;">Measures ranking quality of retrieved
-                                documents. Higher is better (ideal: 1.0)</p>
-                            <hr style="margin: 15px 0;">
-                            <p><strong>Avg Precision:</strong> <span id="avgPrecision">--</span></p>
-                            <p style="margin-top: 10px;"><strong>Avg Recall:</strong> <span id="avgRecall">--</span></p>
-                        </div>
-                    </div>
-                </div>
             </div>
-            <!-- Generation Section -->
-            <div id="generation" class="section">
-                <h2 class="chart-title">🎯 Generation Quality Metrics</h2>
-                <div class="two-column">
-                    <div class="chart-container">
-                        <div class="chart-title">Quality Score Trends</div>
-                        <div id="generationChart"></div>
-                    </div>
-                    <div class="chart-container">
-                        <div class="chart-title">Average Scores</div>
-                        <div id="generationBars"></div>
-                    </div>
                 </div>
-                <div class="info-box">
-                    <p><strong>ROUGE-L:</strong> Token-level overlap between generated and reference answers (0-1)</p>
-                    <p><strong>BERTScore:</strong> Semantic similarity using contextual embeddings (0-1)</p>
-                    <p><strong>Answer Relevance:</strong> How relevant is the answer to the query (0-1)</p>
                 </div>
             </div>
-            <!-- Faithfulness Section -->
-            <div id="faithfulness" class="section">
-                <h2 class="chart-title">✅ Faithfulness & Source Attribution</h2>
-                <div class="two-column">
-                    <div class="chart-container">
-                        <div class="chart-title">Hallucination Distribution</div>
-                        <div id="hallucinationChart"></div>
-                    </div>
-                    <div class="chart-container">
-                        <div class="chart-title">Faithfulness Trend</div>
-                        <div id="faithfulnessChart"></div>
-                    </div>
                 </div>
             </div>
-            <!-- Performance Section -->
-            <div id="performance" class="section">
-                <h2 class="chart-title">⚡ Performance & Cost Analysis</h2>
-                <div class="two-column">
-                    <div class="chart-container">
-                        <div class="chart-title">Latency vs Cost</div>
-                        <div id="latencyChart"></div>
-                    </div>
-                    <div class="chart-container">
-                        <div class="chart-title">Latency Percentiles</div>
-                        <div id="percentileChart"></div>
-                    </div>
                 </div>
-                <div class="metrics-grid">
-                    <div class="metric-card">
-                        <div class="metric-label">P50 Latency</div>
-                        <div class="metric-value" id="p50Value">--</div>
-                        <div class="metric-unit">milliseconds</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">P95 Latency</div>
-                        <div class="metric-value" id="p95Value">--</div>
-                        <div class="metric-unit">milliseconds</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">P99 Latency</div>
-                        <div class="metric-value" id="p99Value">--</div>
-                        <div class="metric-unit">milliseconds</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Avg Cost</div>
-                        <div class="metric-value" id="costValue">--</div>
-                        <div class="metric-unit">cents per query</div>
-                    </div>
                 </div>
             </div>
-            <!-- Failures Section -->
-            <div id="failures" class="section">
-                <h2 class="chart-title">❌ Failure Mode Analysis</h2>
-                <div class="two-column">
-                    <div class="chart-container">
-                        <div class="chart-title">Failure Distribution</div>
-                        <div id="failureChart"></div>
-                    </div>
-                    <div class="chart-container">
-                        <div class="chart-title">Failure Summary</div>
-                        <div style="padding: 20px;">
-                            <p><strong>Total Failures:</strong> <span id="totalFailures">0</span></p>
-                            <p style="margin-top: 15px;"><strong>Hallucinations:</strong> <span id="hallCount">0</span>
-                            </p>
-                            <p><strong>Low Retrieval:</strong> <span id="retCount">0</span></p>
-                            <p><strong>Low Generation:</strong> <span id="genCount">0</span></p>
-                            <p><strong>Low Faithfulness:</strong> <span id="faithCount">0</span></p>
                         </div>
                     </div>
                 </div>
-                <div class="chart-container">
-                    <div class="chart-title">Recent Failures</div>
-                    <div class="failure-list" id="failureList"></div>
                 </div>
             </div>
         </div>
     </div>
     <script>
         let metricsData = null;
         let timeseriesData = null;
         let failureData = null;
         function showSection(sectionId) {
-            // Hide all sections
             document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
-            document.querySelectorAll('.nav-btn').forEach(b => b.classList.remove('active'));
-            // Show selected section
             document.getElementById(sectionId).classList.add('active');
             event.target.classList.add('active');
-            // Load data for this section
-            if (sectionId === 'overview') loadMetrics();
-            else if (sectionId === 'retrieval') loadRetrievalCharts();
-            else if (sectionId === 'generation') loadGenerationCharts();
-            else if (sectionId === 'faithfulness') loadFaithfulnessCharts();
-            else if (sectionId === 'performance') loadPerformanceCharts();
-            else if (sectionId === 'failures') loadFailureCharts();
         }
         async function loadMetrics() {
             showLoading(true);
             try {
-                const response = await fetch('/evaluation/metrics');
                 metricsData = await response.json();
-                // Update overview cards
-                const metricsGrid = document.getElementById('metricsGrid');
-                metricsGrid.innerHTML = `
-                    <div class="metric-card">
-                        <div class="metric-label">Total Evaluations</div>
-                        <div class="metric-value">${metricsData.total_evaluations}</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Avg Precision</div>
-                        <div class="metric-value">${metricsData.retrieval_precision_mean.toFixed(3)}</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Avg BERTScore</div>
-                        <div class="metric-value">${metricsData.bert_score_mean.toFixed(3)}</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Faithfulness</div>
-                        <div class="metric-value">${metricsData.faithfulness_mean.toFixed(3)}</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Hallucination Rate</div>
-                        <div class="metric-value">${(metricsData.hallucination_rate * 100).toFixed(1)}%</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Avg Latency</div>
-                        <div class="metric-value">${metricsData.latency_mean.toFixed(0)}</div>
-                        <div class="metric-unit">ms</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">MRR</div>
-                        <div class="metric-value">${metricsData.mrr.toFixed(3)}</div>
-                    </div>
-                    <div class="metric-card">
-                        <div class="metric-label">Cost/Query</div>
-                        <div class="metric-value">$${(metricsData.cost_per_query / 100).toFixed(4)}</div>
-                    </div>
-                `;
-                document.getElementById('totalEvals').textContent = metricsData.total_evaluations;
-                document.getElementById('lastUpdated').textContent = new Date(metricsData.timestamp).toLocaleString();
-                document.getElementById('systemStatus').textContent = metricsData.hallucination_rate < 0.15 ? '✅ Healthy' : '⚠️ Issues Detected';
             } catch (e) {
                 console.error('Error loading metrics:', e);
@@ -520,11 +685,13 @@
         async function loadRetrievalCharts() {
             try {
-                const response = await fetch('/evaluation/timeseries');
-                timeseriesData = await response.json();
                 if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
-                    document.getElementById('retrievalChart').innerHTML = '<p style="padding: 20px;">No data yet</p>';
                     return;
                 }
@@ -533,7 +700,7 @@
                     y: timeseriesData.retrieval_precision,
                     name: 'Precision',
                     mode: 'lines+markers',
-                    line: { color: '#667eea' }
                 };
                 const trace2 = {
@@ -541,21 +708,25 @@
                     y: timeseriesData.retrieval_recall,
                     name: 'Recall',
                     mode: 'lines+markers',
-                    line: { color: '#764ba2' }
                 };
-                Plotly.newPlot('retrievalChart', [trace1, trace2], {
-                    title: '',
                     xaxis: { title: 'Query Index' },
                     yaxis: { title: 'Score' },
                     hovermode: 'x unified',
-                    responsive: true
-                });
                 if (metricsData) {
-                    document.getElementById('mrrValue').textContent = metricsData.mrr.toFixed(3);
-                    document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean.toFixed(3);
-                    document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean.toFixed(3);
                 }
             } catch (e) {
@@ -566,7 +737,7 @@
         async function loadGenerationCharts() {
             try {
                 if (!timeseriesData) {
-                    const response = await fetch('/evaluation/timeseries');
                     timeseriesData = await response.json();
                 }
@@ -577,7 +748,7 @@
                     y: timeseriesData.rouge_l,
                     name: 'ROUGE-L',
                     mode: 'lines+markers',
-                    line: { color: '#f39c12' }
                 };
                 const trace2 = {
@@ -585,23 +756,34 @@
                     y: timeseriesData.bert_score,
                     name: 'BERTScore',
                     mode: 'lines+markers',
-                    line: { color: '#2ecc71' }
                 };
-                Plotly.newPlot('generationChart', [trace1, trace2], {
-                    title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Score' }, hovermode: 'x unified', responsive: true
-                });
                 if (metricsData) {
                     const barsTrace = {
                         x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
                         y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
                         type: 'bar',
-                        marker: { color: ['#f39c12', '#2ecc71', '#3498db'] }
                     };
                     Plotly.newPlot('generationBars', [barsTrace], {
-                        title: '', yaxis: { title: 'Score' }, responsive: true, showlegend: false
                     });
                 }
@@ -614,21 +796,24 @@
             try {
                 if (!metricsData) await loadMetrics();
                 if (!timeseriesData) {
-                    const response = await fetch('/evaluation/timeseries');
                     timeseriesData = await response.json();
                 }
-                const hallRate = metricsData.hallucination_rate;
                 const faithfulRate = 1 - hallRate;
                 const pieTrace = {
                     labels: ['Faithful Answers', 'Hallucinations'],
                     values: [faithfulRate * 100, hallRate * 100],
                     type: 'pie',
-                    marker: { colors: ['#2ecc71', '#e74c3c'] }
                 };
-                Plotly.newPlot('hallucinationChart', [pieTrace], { title: '', responsive: true });
                 if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
                     const faithTrace = {
@@ -636,12 +821,16 @@
                         y: timeseriesData.faithfulness,
                         name: 'Faithfulness',
                         mode: 'lines+markers',
-                        line: { color: '#16a085', width: 2 },
                         marker: { size: 6 }
                     };
                     Plotly.newPlot('faithfulnessChart', [faithTrace], {
-                        title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Score (0-1)' }, responsive: true
                     });
                 }
@@ -654,28 +843,32 @@
             try {
                 if (!metricsData) await loadMetrics();
                 if (!timeseriesData) {
-                    const response = await fetch('/evaluation/timeseries');
                     timeseriesData = await response.json();
                 }
                 if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
                     const latencyTrace = {
-                        x: timeseriesData.latency_ms,
                         y: timeseriesData.latency_ms,
-                        mode: 'markers',
-                        marker: { size: 8, color: timeseriesData.query_idx, colorscale: 'Viridis', showscale: true },
-                        type: 'scatter'
                     };
                     Plotly.newPlot('latencyChart', [latencyTrace], {
-                        title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Latency (ms)' }, responsive: true
                     });
                 }
-                document.getElementById('p50Value').textContent = metricsData.latency_p50.toFixed(0);
-                document.getElementById('p95Value').textContent = metricsData.latency_p95.toFixed(0);
-                document.getElementById('p99Value').textContent = metricsData.latency_p99.toFixed(0);
-                document.getElementById('costValue').textContent = (metricsData.cost_per_query / 100).toFixed(4);
             } catch (e) {
                 console.error('Error loading performance charts:', e);
@@ -684,25 +877,28 @@
         async function loadFailureCharts() {
             try {
-                const response = await fetch('/evaluation/failures');
                 failureData = await response.json();
                 const failureChart = {
                     x: Object.keys(failureData.failure_modes),
                     y: Object.values(failureData.failure_modes),
                     type: 'bar',
-                    marker: { color: '#e74c3c' }
                 };
                 Plotly.newPlot('failureChart', [failureChart], {
-                    title: '', yaxis: { title: 'Count' }, responsive: true, showlegend: false
                 });
                 document.getElementById('totalFailures').textContent = failureData.total_failures;
                 document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
                 document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
                 document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
-                document.getElementById('faithCount').textContent = failureData.failure_modes.low_faithfulness;
                 // Show recent failures
                 const failureList = document.getElementById('failureList');
@@ -713,10 +909,13 @@
                 ];
                 allFailures.forEach(f => {
-                    html += `<div class="failure-item"><div class="failure-item-query">${f.query}</div><div class="failure-item-score">Score: ${f.score.toFixed(3)}</div></div>`;
                 });
-                failureList.innerHTML = html || '<p style="padding: 20px; color: #999;">No failures detected! 🎉</p>';
             } catch (e) {
                 console.error('Error loading failure analysis:', e);
@@ -729,7 +928,7 @@
         async function exportResults() {
             try {
-                const response = await fetch('/evaluation/export');
                 const blob = await response.blob();
                 const url = window.URL.createObjectURL(blob);
                 const a = document.createElement('a');
@@ -745,7 +944,7 @@
         async function clearResults() {
             if (confirm('Are you sure you want to clear all results?')) {
                 try {
-                    await fetch('/evaluation/reset', { method: 'POST' });
                     metricsData = null;
                     timeseriesData = null;
                     failureData = null;
@@ -757,6 +956,33 @@
             }
         }
         // Load metrics on page load
         window.addEventListener('load', loadMetrics);
     </script>

     <title>RAG Evaluation Dashboard</title>
     <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
     <style>
+        /* -----------------------------
+           Design tokens (matching index.html)
+        ------------------------------*/
+        /* Light-theme defaults. Every colour and radius in the rules below is read
+           through var(--token), so re-declaring a token on an ancestor (see the
+           [data-theme="dark"] block) re-skins the page without touching the rules. */
+        :root {
+            /* Surfaces and page background */
+            --surface: #ffffff;
+            --surface-subtle: #fafafa;
+            --bg-main: #f5f7fb;
+            /* NOTE(review): --card-bg is not referenced by the rules visible in this
+               sheet (.card uses --surface) - confirm whether it is still needed. */
+            --card-bg: #ffffff;
+            /* Brand accent: buttons, active tab, default metric card, info box */
+            --accent: #2563eb;
+            --accent-soft: #eff6ff;
+            /* Text and hairline borders */
+            --text-main: #111827;
+            --text-muted: #6b7280;
+            --border-soft: #e5e7eb;
+            /* Status colours: metric-card variants, failure items, .status-* helpers */
+            --success: #16a34a;
+            --error: #dc2626;
+            --warning: #f59e0b;
+            /* NOTE(review): --info duplicates the light --accent value and is not
+               referenced by the rules visible in this sheet. */
+            --info: #2563eb;
+            /* Corner radii */
+            --radius-sm: 6px;
+            --radius-md: 10px;
+            --radius-lg: 14px;
+        }
+        /* Dark mode: overrides only the surface, accent, text and border tokens.
+           The status colours (--success/--error/--warning/--info) and the radius
+           tokens are not re-declared here, so they keep their :root values.
+           NOTE(review): which element receives data-theme="dark" is decided by
+           script that is not visible in this part of the file - presumably <html>
+           or <body>; confirm. */
+        [data-theme="dark"] {
+            --surface: #1f2937;
+            --surface-subtle: #111827;
+            --bg-main: #0f172a;
+            --card-bg: #1e293b;
+            --accent: #60a5fa;
+            --accent-soft: #1e3a5f;
+            --text-main: #f1f5f9;
+            --text-muted: #94a3b8;
+            --border-soft: #334155;
+        }
+        /* -----------------------------
+           Reset
+        ------------------------------*/
         * {
             margin: 0;
             padding: 0;
             box-sizing: border-box;
         }
+        /* -----------------------------
+           Base
+        ------------------------------*/
         body {
+            font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+            background: var(--bg-main);
             min-height: 100vh;
+            padding: 24px;
+            color: var(--text-main);
         }
+        /* -----------------------------
+           Container
+        ------------------------------*/
         .container {
+            max-width: 1200px;
             margin: 0 auto;
         }
+        /* -----------------------------
+           Header
+        ------------------------------*/
+        header {
             text-align: center;
+            margin-bottom: 36px;
         }
+        header h1 {
+            font-size: 2.2rem;
+            font-weight: 600;
+            letter-spacing: -0.02em;
+            margin-bottom: 8px;
         }
+        header p {
+            font-size: 1rem;
+            color: var(--text-muted);
+            margin-bottom: 20px;
         }
         .nav-buttons {
             display: flex;
             justify-content: center;
+            gap: 12px;
             flex-wrap: wrap;
+            margin-bottom: 12px;
         }
         .nav-btn {
+            /* Header navigation control. It is applied to <a> elements in the page
+               header, hence inline-flex layout and text-decoration: none so the
+               links render as buttons. The 8px gap spaces flex children (e.g. an
+               icon next to a label). */
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+            padding: 10px 18px;
+            background: var(--accent);
             color: white;
+            text-decoration: none;
+            border-radius: var(--radius-md);
+            font-size: 0.9rem;
+            font-weight: 500;
+            /* Animates the hover colour change and the 1px lift set in :hover */
+            transition: background 0.15s ease, transform 0.15s ease;
+            border: none;
             cursor: pointer;
         }
+        .nav-btn:hover {
+            background: #1d4ed8;
+            transform: translateY(-1px);
         }
+        .nav-btn.secondary {
+            background: var(--surface);
+            color: var(--text-main);
+            border: 1px solid var(--border-soft);
         }
+        .nav-btn.secondary:hover {
+            background: var(--surface-subtle);
         }
+        /* Dark theme colours for PRIMARY nav buttons only.
+           `.secondary` is excluded on purpose: an unscoped
+           `[data-theme="dark"] .nav-btn` ties `.nav-btn.secondary` on specificity
+           and, being later in the sheet, would win the cascade - repainting the
+           secondary (outlined) button and its hover state in the primary blue.
+           With :not(.secondary) the secondary button keeps its --surface /
+           --surface-subtle backgrounds, which already adapt via the dark tokens. */
+        [data-theme="dark"] .nav-btn:not(.secondary) {
+            background: #60a5fa;
         }
+        [data-theme="dark"] .nav-btn:not(.secondary):hover {
+            background: #3b82f6;
+        }
+        /* Theme toggle: pinned to the top-right corner of the viewport
+           (position: fixed), so it stays visible while the page scrolls.
+           z-index keeps it above the page content. Colours come from the theme
+           tokens, so the control itself re-skins when the theme changes. */
+        .theme-toggle {
+            position: fixed;
+            top: 20px;
+            right: 20px;
+            padding: 8px 14px;
+            background: var(--surface);
+            border: 1px solid var(--border-soft);
+            border-radius: var(--radius-md);
+            cursor: pointer;
+            font-size: 0.85rem;
+            color: var(--text-main);
+            z-index: 100;
         }
+        /* -----------------------------
+           Tab Navigation
+        ------------------------------*/
+        .tab-nav {
+            display: flex;
+            justify-content: center;
+            gap: 8px;
+            margin-bottom: 24px;
+            flex-wrap: wrap;
+        }
+        /* Section tab. Inactive tabs look like outlined surface buttons; the
+           :hover and .active rules below change background, text colour and
+           border colour only. */
+        .tab-btn {
+            padding: 10px 20px;
+            background: var(--surface);
+            border: 1px solid var(--border-soft);
+            color: var(--text-main);
+            border-radius: var(--radius-md);
+            cursor: pointer;
+            font-size: 0.9rem;
+            /* List the animated properties explicitly (as .nav-btn, .metric-card
+               and .btn do) instead of `all`, so unrelated property changes are
+               never animated by accident. */
+            transition: background 0.15s ease, color 0.15s ease, border-color 0.15s ease;
+        }
+        .tab-btn:hover {
+            background: var(--surface-subtle);
+        }
+        .tab-btn.active {
+            background: var(--accent);
+            color: white;
+            border-color: var(--accent);
+        }
+        /* -----------------------------
+           Cards
+        ------------------------------*/
+        .card {
+            background: var(--surface);
+            border-radius: var(--radius-lg);
+            padding: 24px;
+            border: 1px solid var(--border-soft);
+            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
+            margin-bottom: 24px;
+        }
+        .card h2 {
+            font-size: 1.1rem;
+            font-weight: 600;
+            margin-bottom: 16px;
+            color: var(--text-main);
+        }
+        /* -----------------------------
+           Metrics Grid
+        ------------------------------*/
         .metrics-grid {
             display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
+            gap: 16px;
+            margin-bottom: 24px;
         }
         .metric-card {
+            /* Default metric tile: accent background with white text. The
+               .success / .warning / .error modifiers below swap only the
+               background for the matching status token; text stays white. */
+            background: var(--accent);
             color: white;
+            padding: 20px;
+            border-radius: var(--radius-md);
             text-align: center;
+            transition: transform 0.15s ease;
         }
         .metric-card:hover {
+            /* Small lift on hover, animated by the transition declared above */
+            transform: translateY(-2px);
+        }
+        .metric-card.success {
+            background: var(--success);
+        }
+        /* NOTE(review): white text on --warning (#f59e0b) is likely below the
+           WCAG AA contrast ratio - verify and consider a dark text colour here. */
+        .metric-card.warning {
+            background: var(--warning);
+        }
+        .metric-card.error {
+            background: var(--error);
         }
         .metric-label {
+            font-size: 0.8rem;
             opacity: 0.9;
+            margin-bottom: 8px;
             text-transform: uppercase;
+            letter-spacing: 0.5px;
         }
         .metric-value {
+            font-size: 1.8rem;
+            font-weight: 700;
         }
         .metric-unit {
+            font-size: 0.75rem;
             opacity: 0.8;
+            margin-top: 4px;
         }
+        /* -----------------------------
+           Charts
+        ------------------------------*/
         .chart-container {
+            background: var(--surface);
+            border: 1px solid var(--border-soft);
+            border-radius: var(--radius-md);
+            padding: 16px;
+            margin-bottom: 20px;
+            min-height: 350px;
         }
         .chart-title {
+            font-size: 1rem;
             font-weight: 600;
+            margin-bottom: 12px;
+            color: var(--text-main);
         }
         .two-column {
             display: grid;
             grid-template-columns: 1fr 1fr;
             gap: 20px;
         }
         @media (max-width: 900px) {
             }
         }
+        /* -----------------------------
+           Sections
+        ------------------------------*/
+        /* Tab panels: every .section is hidden unless it also carries .active.
+           The page script (showSection) removes .active from all sections and
+           adds it to the selected one, so exactly one panel is shown at a time. */
+        .section {
+            display: none;
         }
+        .section.active {
+            display: block;
+            /* Plays once each time the panel becomes active */
+            animation: fadeIn 0.3s ease;
         }
+        /* Fade in while sliding up 10px into the final position */
+        @keyframes fadeIn {
+            from {
+                opacity: 0;
+                transform: translateY(10px);
+            }
+            to {
+                opacity: 1;
+                transform: translateY(0);
+            }
         }
+        /* -----------------------------
+           Info Box
+        ------------------------------*/
         .info-box {
+            background: var(--accent-soft);
+            border-left: 4px solid var(--accent);
+            padding: 14px;
+            border-radius: var(--radius-sm);
             margin-bottom: 20px;
         }
         .info-box p {
+            font-size: 0.9rem;
+            color: var(--text-muted);
         }
+        /* -----------------------------
+           Button Group
+        ------------------------------*/
         .button-group {
             display: flex;
             gap: 10px;
         }
         .btn {
+            padding: 10px 18px;
+            background: var(--accent);
             color: white;
             border: none;
+            border-radius: var(--radius-md);
             cursor: pointer;
+            font-size: 0.9rem;
+            font-weight: 500;
+            transition: background 0.15s ease;
         }
         .btn:hover {
+            background: #1d4ed8;
         }
         .btn-secondary {
+            background: var(--surface);
+            color: var(--text-main);
+            border: 1px solid var(--border-soft);
         }
         .btn-secondary:hover {
+            background: var(--surface-subtle);
         }
+        /* -----------------------------
+           Loading
+        ------------------------------*/
         .loading {
+            /* Hidden by default. NOTE(review): presumably revealed by the
+               script's showLoading() helper, whose body is not visible here. */
             display: none;
             text-align: center;
+            padding: 40px;
+            color: var(--accent);
         }
         .spinner {
+            /* A 40px ring: soft border all round with an accent-coloured top
+               edge, driven by the `spin` keyframes on an endless 1s linear loop. */
+            border: 4px solid var(--border-soft);
+            border-top: 4px solid var(--accent);
             border-radius: 50%;
             width: 40px;
             height: 40px;
             animation: spin 1s linear infinite;
+            /* Centre horizontally and leave room for the text below */
+            margin: 0 auto 15px;
         }
         @keyframes spin {
             }
         }
+        /* -----------------------------
+           Failure Items
+        ------------------------------*/
         .failure-item {
+            background: var(--surface-subtle);
+            border-left: 4px solid var(--error);
+            padding: 12px;
+            margin-bottom: 10px;
+            border-radius: var(--radius-sm);
         }
         .failure-item-query {
+            font-size: 0.9rem;
+            color: var(--text-main);
+            margin-bottom: 4px;
         }
         .failure-item-score {
+            font-size: 0.8rem;
+            color: var(--error);
+            font-weight: 500;
+        }
+        /* -----------------------------
+           Status Colors
+        ------------------------------*/
+        .status-good {
+            color: var(--success);
+        }
+        .status-warning {
+            color: var(--warning);
+        }
+        .status-critical {
+            color: var(--error);
+        }
+        /* Plotly chart background fix for dark mode */
+        [data-theme="dark"] .js-plotly-plot .plotly .bg {
+            fill: var(--surface) !important;
         }
     </style>
 </head>
 <body>
+    <button class="theme-toggle" id="themeToggle">🌙 Dark mode</button>
     <div class="container">
+        <header>
+            <h1>📊 RAG Evaluation Dashboard</h1>
+            <p>Real-time metrics and analysis for your RAG pipeline</p>
             <div class="nav-buttons">
+                <a href="/" class="nav-btn secondary">← Back to Main</a>
+                <a href="/ragas-demo" class="nav-btn">🔬 RAGAS Evaluation</a>
             </div>
+        </header>
+        <!-- Tab Navigation -->
+        <div class="tab-nav">
+            <button class="tab-btn active" onclick="showSection('overview')">Overview</button>
+            <button class="tab-btn" onclick="showSection('retrieval')">Retrieval</button>
+            <button class="tab-btn" onclick="showSection('generation')">Generation</button>
+            <button class="tab-btn" onclick="showSection('faithfulness')">Faithfulness</button>
+            <button class="tab-btn" onclick="showSection('performance')">Performance</button>
+            <button class="tab-btn" onclick="showSection('failures')">Failures</button>
         </div>
+        <div class="loading" id="loading">
+            <div class="spinner"></div>
+            <p>Loading evaluation data...</p>
+        </div>
+        <!-- Overview Section -->
+        <div class="section active" id="overview">
+            <div class="metrics-grid">
+                <div class="metric-card">
+                    <div class="metric-label">Total Evaluations</div>
+                    <div class="metric-value" id="totalEvaluations">0</div>
                 </div>
+                <div class="metric-card success">
+                    <div class="metric-label">Avg Precision</div>
+                    <div class="metric-value" id="overviewPrecision">-</div>
                 </div>
+                <div class="metric-card">
+                    <div class="metric-label">Avg Recall</div>
+                    <div class="metric-value" id="overviewRecall">-</div>
+                </div>
+                <div class="metric-card success">
+                    <div class="metric-label">Avg BERTScore</div>
+                    <div class="metric-value" id="overviewBert">-</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">Avg Faithfulness</div>
+                    <div class="metric-value" id="overviewFaith">-</div>
+                </div>
+                <div class="metric-card warning">
+                    <div class="metric-label">Hallucination Rate</div>
+                    <div class="metric-value" id="overviewHalluc">-</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">Avg Latency</div>
+                    <div class="metric-value" id="overviewLatency">-</div>
+                    <div class="metric-unit">ms</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">MRR</div>
+                    <div class="metric-value" id="overviewMRR">-</div>
                 </div>
             </div>
+            <div class="button-group">
+                <button class="btn" onclick="loadMetrics()">🔄 Refresh</button>
+                <button class="btn btn-secondary" onclick="exportResults()">📥 Export CSV</button>
+                <button class="btn btn-secondary" onclick="clearResults()">🗑️ Clear Results</button>
             </div>
+            <div class="info-box">
+                <p><strong>How to use:</strong> This dashboard shows evaluation metrics from your RAG pipeline.
+                    Use the tabs above to explore different aspects of performance.
+                    Try the <a href="/ragas-demo">RAGAS Evaluation</a> to run live evaluations.</p>
+            </div>
+        </div>
+        <!-- Retrieval Section -->
+        <div class="section" id="retrieval">
+            <div class="card">
+                <h2>Retrieval Metrics Over Time</h2>
+                <div class="chart-container" id="retrievalChart"></div>
+            </div>
+            <div class="metrics-grid">
+                <div class="metric-card">
+                    <div class="metric-label">MRR</div>
+                    <div class="metric-value" id="mrrValue">-</div>
                 </div>
+                <div class="metric-card success">
+                    <div class="metric-label">Avg Precision</div>
+                    <div class="metric-value" id="avgPrecision">-</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">Avg Recall</div>
+                    <div class="metric-value" id="avgRecall">-</div>
                 </div>
             </div>
+        </div>
+        <!-- Generation Section -->
+        <div class="section" id="generation">
+            <div class="two-column">
+                <div class="card">
+                    <h2>Generation Quality Over Time</h2>
+                    <div class="chart-container" id="generationChart"></div>
+                </div>
+                <div class="card">
+                    <h2>Average Scores</h2>
+                    <div class="chart-container" id="generationBars"></div>
                 </div>
             </div>
+        </div>
+        <!-- Faithfulness Section -->
+        <div class="section" id="faithfulness">
+            <div class="two-column">
+                <div class="card">
+                    <h2>Hallucination Rate</h2>
+                    <div class="chart-container" id="hallucinationChart"></div>
                 </div>
+                <div class="card">
+                    <h2>Faithfulness Over Time</h2>
+                    <div class="chart-container" id="faithfulnessChart"></div>
                 </div>
             </div>
+        </div>
+        <!-- Performance Section -->
+        <div class="section" id="performance">
+            <div class="card">
+                <h2>Latency Distribution</h2>
+                <div class="chart-container" id="latencyChart"></div>
+            </div>
+            <div class="metrics-grid">
+                <div class="metric-card">
+                    <div class="metric-label">P50 Latency</div>
+                    <div class="metric-value" id="p50Value">-</div>
+                    <div class="metric-unit">ms</div>
+                </div>
+                <div class="metric-card warning">
+                    <div class="metric-label">P95 Latency</div>
+                    <div class="metric-value" id="p95Value">-</div>
+                    <div class="metric-unit">ms</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">P99 Latency</div>
+                    <div class="metric-value" id="p99Value">-</div>
+                    <div class="metric-unit">ms</div>
+                </div>
+                <div class="metric-card">
+                    <div class="metric-label">Cost per Query</div>
+                    <div class="metric-value" id="costValue">-</div>
+                    <div class="metric-unit">USD</div>
+                </div>
+            </div>
+        </div>
+        <!-- Failures Section -->
+        <div class="section" id="failures">
+            <div class="two-column">
+                <div class="card">
+                    <h2>Failure Modes</h2>
+                    <div class="chart-container" id="failureChart"></div>
+                </div>
+                <div class="card">
+                    <h2>Failure Counts</h2>
+                    <div class="metrics-grid" style="margin-top: 20px;">
+                        <div class="metric-card error">
+                            <div class="metric-label">Total Failures</div>
+                            <div class="metric-value" id="totalFailures">0</div>
+                        </div>
+                        <div class="metric-card error">
+                            <div class="metric-label">Hallucinations</div>
+                            <div class="metric-value" id="hallCount">0</div>
+                        </div>
+                        <div class="metric-card warning">
+                            <div class="metric-label">Low Retrieval</div>
+                            <div class="metric-value" id="retCount">0</div>
+                        </div>
+                        <div class="metric-card warning">
+                            <div class="metric-label">Low Generation</div>
+                            <div class="metric-value" id="genCount">0</div>
                         </div>
                     </div>
                 </div>
+            </div>
+            <div class="card">
+                <h2>Recent Failures</h2>
+                <div id="failureList">
+                    <p style="color: var(--text-muted); padding: 20px;">No failures detected! 🎉</p>
                 </div>
             </div>
         </div>
     </div>
     <script>
+        const API_URL = window.location.origin;
         let metricsData = null;
         let timeseriesData = null;
         let failureData = null;
+        // Tab navigation
         function showSection(sectionId) {
             document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
+            document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
             document.getElementById(sectionId).classList.add('active');
             event.target.classList.add('active');
+            // Load section-specific data
+            switch (sectionId) {
+                case 'retrieval': loadRetrievalCharts(); break;
+                case 'generation': loadGenerationCharts(); break;
+                case 'faithfulness': loadFaithfulnessCharts(); break;
+                case 'performance': loadPerformanceCharts(); break;
+                case 'failures': loadFailureCharts(); break;
+            }
         }
         async function loadMetrics() {
             showLoading(true);
             try {
+                const response = await fetch(`${API_URL}/evaluation/metrics`);
                 metricsData = await response.json();
+                // Update overview
+                document.getElementById('totalEvaluations').textContent = metricsData.total_evaluations;
+                document.getElementById('overviewPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
+                document.getElementById('overviewRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
+                document.getElementById('overviewBert').textContent = metricsData.bert_score_mean?.toFixed(3) || '-';
+                document.getElementById('overviewFaith').textContent = metricsData.faithfulness_mean?.toFixed(3) || '-';
+                document.getElementById('overviewHalluc').textContent = ((metricsData.hallucination_rate || 0) * 100).toFixed(1) + '%';
+                document.getElementById('overviewLatency').textContent = metricsData.latency_mean?.toFixed(0) || '-';
+                document.getElementById('overviewMRR').textContent = metricsData.mrr?.toFixed(3) || '-';
             } catch (e) {
                 console.error('Error loading metrics:', e);
         async function loadRetrievalCharts() {
             try {
+                if (!timeseriesData) {
+                    const response = await fetch(`${API_URL}/evaluation/timeseries`);
+                    timeseriesData = await response.json();
+                }
                 if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
+                    document.getElementById('retrievalChart').innerHTML = '<p style="padding: 40px; text-align: center; color: var(--text-muted);">No data yet. Run some evaluations first.</p>';
                     return;
                 }
                     y: timeseriesData.retrieval_precision,
                     name: 'Precision',
                     mode: 'lines+markers',
+                    line: { color: '#2563eb' }
                 };
                 const trace2 = {
                     y: timeseriesData.retrieval_recall,
                     name: 'Recall',
                     mode: 'lines+markers',
+                    line: { color: '#16a34a' }
                 };
+                const layout = {
                     xaxis: { title: 'Query Index' },
                     yaxis: { title: 'Score' },
                     hovermode: 'x unified',
+                    responsive: true,
+                    paper_bgcolor: 'rgba(0,0,0,0)',
+                    plot_bgcolor: 'rgba(0,0,0,0)',
+                    font: { color: getComputedStyle(document.body).getPropertyValue('--text-main') }
+                };
+                Plotly.newPlot('retrievalChart', [trace1, trace2], layout);
                 if (metricsData) {
+                    document.getElementById('mrrValue').textContent = metricsData.mrr?.toFixed(3) || '-';
+                    document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
+                    document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
                 }
             } catch (e) {
         async function loadGenerationCharts() {
             try {
                 if (!timeseriesData) {
+                    const response = await fetch(`${API_URL}/evaluation/timeseries`);
                     timeseriesData = await response.json();
                 }
                     y: timeseriesData.rouge_l,
                     name: 'ROUGE-L',
                     mode: 'lines+markers',
+                    line: { color: '#f59e0b' }
                 };
                 const trace2 = {
                     y: timeseriesData.bert_score,
                     name: 'BERTScore',
                     mode: 'lines+markers',
+                    line: { color: '#16a34a' }
                 };
+                const layout = {
+                    xaxis: { title: 'Query Index' },
+                    yaxis: { title: 'Score' },
+                    hovermode: 'x unified',
+                    responsive: true,
+                    paper_bgcolor: 'rgba(0,0,0,0)',
+                    plot_bgcolor: 'rgba(0,0,0,0)'
+                };
+                Plotly.newPlot('generationChart', [trace1, trace2], layout);
                 if (metricsData) {
                     const barsTrace = {
                         x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
                         y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
                         type: 'bar',
+                        marker: { color: ['#f59e0b', '#16a34a', '#2563eb'] }
                     };
                     Plotly.newPlot('generationBars', [barsTrace], {
+                        yaxis: { title: 'Score' },
+                        responsive: true,
+                        showlegend: false,
+                        paper_bgcolor: 'rgba(0,0,0,0)',
+                        plot_bgcolor: 'rgba(0,0,0,0)'
                     });
                 }
             try {
                 if (!metricsData) await loadMetrics();
                 if (!timeseriesData) {
+                    const response = await fetch(`${API_URL}/evaluation/timeseries`);
                     timeseriesData = await response.json();
                 }
+                const hallRate = metricsData.hallucination_rate || 0;
                 const faithfulRate = 1 - hallRate;
                 const pieTrace = {
                     labels: ['Faithful Answers', 'Hallucinations'],
                     values: [faithfulRate * 100, hallRate * 100],
                     type: 'pie',
+                    marker: { colors: ['#16a34a', '#dc2626'] }
                 };
+                Plotly.newPlot('hallucinationChart', [pieTrace], {
+                    responsive: true,
+                    paper_bgcolor: 'rgba(0,0,0,0)'
+                });
                 if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
                     const faithTrace = {
                         y: timeseriesData.faithfulness,
                         name: 'Faithfulness',
                         mode: 'lines+markers',
+                        line: { color: '#2563eb', width: 2 },
                         marker: { size: 6 }
                     };
                     Plotly.newPlot('faithfulnessChart', [faithTrace], {
+                        xaxis: { title: 'Query Index' },
+                        yaxis: { title: 'Score (0-1)' },
+                        responsive: true,
+                        paper_bgcolor: 'rgba(0,0,0,0)',
+                        plot_bgcolor: 'rgba(0,0,0,0)'
                     });
                 }
             try {
                 if (!metricsData) await loadMetrics();
                 if (!timeseriesData) {
+                    const response = await fetch(`${API_URL}/evaluation/timeseries`);
                     timeseriesData = await response.json();
                 }
                 if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
                     const latencyTrace = {
+                        x: timeseriesData.query_idx,
                         y: timeseriesData.latency_ms,
+                        mode: 'lines+markers',
+                        line: { color: '#2563eb' },
+                        marker: { size: 6 }
                     };
                     Plotly.newPlot('latencyChart', [latencyTrace], {
+                        xaxis: { title: 'Query Index' },
+                        yaxis: { title: 'Latency (ms)' },
+                        responsive: true,
+                        paper_bgcolor: 'rgba(0,0,0,0)',
+                        plot_bgcolor: 'rgba(0,0,0,0)'
                     });
                 }
+                document.getElementById('p50Value').textContent = metricsData.latency_p50?.toFixed(0) || '-';
+                document.getElementById('p95Value').textContent = metricsData.latency_p95?.toFixed(0) || '-';
+                document.getElementById('p99Value').textContent = metricsData.latency_p99?.toFixed(0) || '-';
+                document.getElementById('costValue').textContent = metricsData.cost_per_query ? (metricsData.cost_per_query / 100).toFixed(4) : '-';
             } catch (e) {
                 console.error('Error loading performance charts:', e);
         async function loadFailureCharts() {
             try {
+                const response = await fetch(`${API_URL}/evaluation/failures`);
                 failureData = await response.json();
                 const failureChart = {
                     x: Object.keys(failureData.failure_modes),
                     y: Object.values(failureData.failure_modes),
                     type: 'bar',
+                    marker: { color: '#dc2626' }
                 };
                 Plotly.newPlot('failureChart', [failureChart], {
+                    yaxis: { title: 'Count' },
+                    responsive: true,
+                    showlegend: false,
+                    paper_bgcolor: 'rgba(0,0,0,0)',
+                    plot_bgcolor: 'rgba(0,0,0,0)'
                 });
                 document.getElementById('totalFailures').textContent = failureData.total_failures;
                 document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
                 document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
                 document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
                 // Show recent failures
                 const failureList = document.getElementById('failureList');
                 ];
                 allFailures.forEach(f => {
+                    html += `<div class="failure-item">
+                        <div class="failure-item-query">${f.query}</div>
+                        <div class="failure-item-score">Score: ${f.score.toFixed(3)}</div>
+                    </div>`;
                 });
+                failureList.innerHTML = html || '<p style="padding: 20px; color: var(--text-muted);">No failures detected! 🎉</p>';
             } catch (e) {
                 console.error('Error loading failure analysis:', e);
         async function exportResults() {
             try {
+                const response = await fetch(`${API_URL}/evaluation/export`);
                 const blob = await response.blob();
                 const url = window.URL.createObjectURL(blob);
                 const a = document.createElement('a');
         async function clearResults() {
             if (confirm('Are you sure you want to clear all results?')) {
                 try {
+                    await fetch(`${API_URL}/evaluation/reset`, { method: 'POST' });
                     metricsData = null;
                     timeseriesData = null;
                     failureData = null;
             }
         }
+        // Dark mode toggle
+        const themeToggle = document.getElementById("themeToggle");
+        const root = document.documentElement;
+        const savedTheme = localStorage.getItem("theme");
+        const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
+        if (savedTheme) {
+            root.setAttribute("data-theme", savedTheme);
+        } else if (prefersDark) {
+            root.setAttribute("data-theme", "dark");
+        }
+        function updateToggleText() {
+            const isDark = root.getAttribute("data-theme") === "dark";
+            themeToggle.textContent = isDark ? "☀️ Light mode" : "🌙 Dark mode";
+        }
+        updateToggleText();
+        themeToggle.addEventListener("click", () => {
+            const isDark = root.getAttribute("data-theme") === "dark";
+            const newTheme = isDark ? "light" : "dark";
+            root.setAttribute("data-theme", newTheme);
+            localStorage.setItem("theme", newTheme);
+            updateToggleText();
+        });
         // Load metrics on page load
         window.addEventListener('load', loadMetrics);
     </script>

frontend/index.html CHANGED Viewed

@@ -485,6 +485,7 @@
             <p>Ask questions about your research papers</p>
             <nav>
                 <a href="/evaluation" class="eval-button">📊 Evaluation Dashboard</a>
             </nav>
             <button id="themeToggle" aria-label="Toggle dark mode" style="
             position: fixed;

             <p>Ask questions about your research papers</p>
             <nav>
                 <a href="/evaluation" class="eval-button">📊 Evaluation Dashboard</a>
+                <a href="/ragas-demo" class="eval-button">🔬 RAGAS Evaluation</a>
             </nav>
             <button id="themeToggle" aria-label="Toggle dark mode" style="
             position: fixed;

frontend/index_.html ADDED Viewed

	@@ -0,0 +1,592 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Document Intelligence RAG</title>
+    <style>
+        /* -----------------------------
+           Base & Layout
+        ------------------------------*/
+        /* Global reset: no default margin/padding, border-box sizing everywhere */
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+        /* Page fills at least the viewport height over a diagonal (135deg) gradient */
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            min-height: 100vh;
+            padding: 20px;
+        }
+        /* Content column: capped at 1000px and centered horizontally */
+        .container {
+            max-width: 1000px;
+            margin: 0 auto;
+        }
+        /* -----------------------------
+           Page Header
+        ------------------------------*/
+        /* Centered white text placed over the gradient page background */
+        header {
+            text-align: center;
+            color: white;
+            margin-bottom: 40px;
+        }
+        /* Title with a soft drop shadow */
+        header h1 {
+            font-size: 2.5em;
+            margin-bottom: 10px;
+            text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
+        }
+        /* Subtitle, slightly translucent */
+        header p {
+            font-size: 1.1em;
+            opacity: 0.9;
+        }
+        /* -----------------------------
+           Main Grid & Cards
+        ------------------------------*/
+        /* Two equal-width columns with a 20px gutter */
+        .main-grid {
+            display: grid;
+            grid-template-columns: 1fr 1fr;
+            gap: 20px;
+            margin-bottom: 20px;
+        }
+        /* White rounded panel with a drop shadow */
+        .card {
+            background: white;
+            border-radius: 12px;
+            padding: 25px;
+            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
+        }
+        /* Card heading */
+        .card h2 {
+            color: #333;
+            margin-bottom: 15px;
+            font-size: 1.3em;
+        }
+        /* -----------------------------
+           Upload Area
+        ------------------------------*/
+        /* Dashed drop zone; the pointer cursor marks it as clickable */
+        .upload-area {
+            border: 2px dashed #667eea;
+            border-radius: 8px;
+            padding: 30px;
+            text-align: center;
+            cursor: pointer;
+            transition: all 0.3s;
+        }
+        .upload-area:hover {
+            border-color: #764ba2;
+            background: #f8f9ff;
+        }
+        /* NOTE(review): `dragover` is presumably toggled by the page script during
+           drag-and-drop; the script is not visible here, confirm */
+        .upload-area.dragover {
+            border-color: #764ba2;
+            background: #f0f2ff;
+        }
+        /* Any input inside the drop zone is hidden from view */
+        .upload-area input {
+            display: none;
+        }
+        .upload-area p {
+            color: #666;
+            margin-bottom: 10px;
+        }
+        /* -----------------------------
+           Buttons
+        ------------------------------*/
+        /* Primary button: gradient fill, white bold label */
+        .btn {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            border: none;
+            padding: 12px 24px;
+            border-radius: 8px;
+            cursor: pointer;
+            font-size: 1em;
+            font-weight: 600;
+            transition: transform 0.2s, box-shadow 0.2s;
+        }
+        /* Hover lifts the button 2px and adds a tinted shadow */
+        .btn:hover {
+            transform: translateY(-2px);
+            box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
+        }
+        /* Pressed state drops the button back to its resting position */
+        .btn:active {
+            transform: translateY(0);
+        }
+        /* Secondary variant: overrides only fill and text color; declared after
+           .btn so it wins when both classes are on the same element */
+        .btn-secondary {
+            background: #f0f0f0;
+            color: #333;
+        }
+        .btn-secondary:hover {
+            background: #e0e0e0;
+            box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
+        }
+        /* -----------------------------
+           Query Input
+        ------------------------------*/
+        /* Horizontal row holding the text field and its sibling controls */
+        .query-input {
+            display: flex;
+            gap: 10px;
+            margin-bottom: 20px;
+        }
+        /* The text field takes all remaining width in the row */
+        .query-input input {
+            flex: 1;
+            padding: 12px;
+            border: 2px solid #e0e0e0;
+            border-radius: 8px;
+            font-size: 1em;
+            transition: border-color 0.3s;
+        }
+        /* Default outline is removed; focus is signalled by the border color instead */
+        .query-input input:focus {
+            outline: none;
+            border-color: #667eea;
+        }
+        /* -----------------------------
+           Status Banners
+        ------------------------------*/
+        /* Shared box for status messages; a modifier class below supplies the colors */
+        .status {
+            padding: 15px;
+            border-radius: 8px;
+            margin-bottom: 15px;
+            font-size: 0.95em;
+        }
+        /* Each variant pairs a pale fill with a darker text color and a saturated
+           4px accent bar on the left edge */
+        .status.success {
+            background: #d4edda;
+            color: #155724;
+            border-left: 4px solid #28a745;
+        }
+        .status.error {
+            background: #f8d7da;
+            color: #721c24;
+            /* Saturated red so the accent bar stands out from the pale pink fill,
+               matching the success and loading variants */
+            border-left: 4px solid #dc3545;
+        }
+        .status.loading {
+            background: #e7f3ff;
+            color: #004085;
+            border-left: 4px solid #0c5ff4;
+        }
+        /* -----------------------------
+           Answer & Sources
+        ------------------------------*/
+        /* Answer panel: light grey fill with an accent bar on the left */
+        .answer-box {
+            background: #f8f9fa;
+            border-left: 4px solid #667eea;
+            padding: 15px;
+            border-radius: 8px;
+            margin-bottom: 20px;
+        }
+        .answer-box h3 {
+            color: #333;
+            margin-bottom: 10px;
+        }
+        .answer-box p {
+            color: #555;
+            line-height: 1.6;
+            margin-bottom: 15px;
+        }
+        /* White panel that lists the individual source items */
+        .sources {
+            background: white;
+            border-radius: 8px;
+            padding: 15px;
+            margin-bottom: 15px;
+        }
+        .sources h4 {
+            color: #333;
+            margin-bottom: 12px;
+            font-size: 0.95em;
+        }
+        /* One source entry: grey fill with a thinner (3px) accent bar */
+        .source-item {
+            padding: 10px;
+            background: #f8f9fa;
+            border-radius: 6px;
+            margin-bottom: 8px;
+            border-left: 3px solid #667eea;
+            font-size: 0.9em;
+        }
+        /* Relevance line inside a source entry, in the accent color */
+        .source-item .relevance {
+            color: #667eea;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }
+        /* Text line inside a source entry, set in italics */
+        .source-item .text {
+            color: #555;
+            font-style: italic;
+        }
+        /* -----------------------------
+           Stats & Status Grids
+        ------------------------------*/
+        /* Two equal columns of stat boxes */
+        .stats {
+            display: grid;
+            grid-template-columns: repeat(2, 1fr);
+            gap: 10px;
+            margin-bottom: 20px;
+        }
+        .stat-box {
+            background: #f8f9fa;
+            padding: 12px;
+            border-radius: 6px;
+            text-align: center;
+        }
+        /* Large bold figure in the accent color */
+        .stat-box .number {
+            font-size: 1.5em;
+            font-weight: bold;
+            color: #667eea;
+        }
+        /* Small grey caption under the figure */
+        .stat-box .label {
+            font-size: 0.85em;
+            color: #666;
+            margin-top: 5px;
+        }
+        /* Four equal columns */
+        .status-grid {
+            display: grid;
+            grid-template-columns: repeat(4, 1fr);
+            gap: 10px;
+        }
+        /* -----------------------------
+           Loading Spinner
+        ------------------------------*/
+        /* Inline 20px ring: a light border with an accent-colored top edge,
+           rotated continuously by the `spin` animation */
+        .loading-spinner {
+            display: inline-block;
+            width: 20px;
+            height: 20px;
+            border: 3px solid #f3f3f3;
+            border-top: 3px solid #667eea;
+            border-radius: 50%;
+            animation: spin 1s linear infinite;
+            margin-right: 10px;
+            vertical-align: middle;
+        }
+        /* One full clockwise turn per animation cycle */
+        @keyframes spin {
+            0% {
+                transform: rotate(0deg);
+            }
+            100% {
+                transform: rotate(360deg);
+            }
+        }
+        /* -----------------------------
+           Utilities & Responsive
+        ------------------------------*/
+        /* Grid item that spans from the first to the last column line */
+        .full-width {
+            grid-column: 1 / -1;
+        }
+        /* Viewports up to 768px wide: stack the main grid and the stats into one
+           column, shrink the title, and halve the status grid to two columns */
+        @media (max-width: 768px) {
+            .main-grid {
+                grid-template-columns: 1fr;
+            }
+            header h1 {
+                font-size: 1.8em;
+            }
+            .stats {
+                grid-template-columns: 1fr;
+            }
+            .status-grid {
+                grid-template-columns: repeat(2, 1fr);
+            }
+        }
+        /* Utility class that removes an element from layout */
+        .hidden {
+            display: none;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>📚 Document Intelligence RAG</h1>
+            <p>Ask questions about your research papers</p>
+        </header>
+        <div class="main-grid">
+            <!-- Upload Section -->
+            <div class="card">
+                <h2>📤 Upload Documents</h2>
+                <div class="upload-area" id="uploadArea">
+                    <p>📁 Drag & drop PDFs here or click to browse</p>
+                    <input type="file" id="fileInput" multiple accept=".pdf">
+                    <button class="btn" onclick="document.getElementById('fileInput').click()">
+                        Choose Files
+                    </button>
+                </div>
+                <div id="uploadStatus" class="status hidden"></div>
+                <div id="stats" class="stats">
+                    <div class="stat-box">
+                        <div class="number" id="totalChunks">0</div>
+                        <div class="label">Total Chunks</div>
+                    </div>
+                    <div class="stat-box">
+                        <div class="number" id="docCount">0</div>
+                        <div class="label">Documents</div>
+                    </div>
+                </div>
+                <button class="btn btn-secondary" onclick="loadStats()">
+                    🔄 Refresh Stats
+                </button>
+                <button class="btn btn-secondary" style="background: #ff6b6b; color: white; margin-top: 10px;"
+                    onclick="resetSystem()">
+                    🗑️ Delete All Documents
+                </button>
+                <p style="font-size: 0.85em; color: #999; margin-top: 10px;">
+                    💾 Documents are stored persistently. They remain after restart.
+                </p>
+            </div>
+            <!-- Query Section -->
+            <div class="card">
+                <h2>❓ Ask Questions</h2>
+                <div class="query-input">
+                    <input type="text" id="queryInput" placeholder="What would you like to know about your documents?"
+                        onkeypress="if(event.key==='Enter') submitQuery()">
+                    <button class="btn" onclick="submitQuery()">Search</button>
+                </div>
+                <div id="queryStatus" class="status hidden"></div>
+                <div id="answerContainer" class="hidden">
+                    <div class="answer-box">
+                        <h3>Answer</h3>
+                        <p id="answerText"></p>
+                    </div>
+                    <div class="sources" id="sourcesBox">
+                        <h4>📖 Sources Used</h4>
+                        <div id="sourcesList"></div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        <!-- Status Indicators -->
+        <div class="card full-width">
+            <h2>🔧 System Status</h2>
+            <div id="healthStatus" class="status-grid">Loading...</div>
+        </div>
+    </div>
+    <script>
+        const API_URL = window.location.origin; // 'http://localhost:8000';
+        // Upload handlers
+        const uploadArea = document.getElementById('uploadArea');
+        const fileInput = document.getElementById('fileInput');
+        uploadArea.addEventListener('click', () => fileInput.click());
+        uploadArea.addEventListener('dragover', (e) => {
+            e.preventDefault();
+            uploadArea.classList.add('dragover');
+        });
+        uploadArea.addEventListener('dragleave', () => {
+            uploadArea.classList.remove('dragover');
+        });
+        uploadArea.addEventListener('drop', (e) => {
+            e.preventDefault();
+            uploadArea.classList.remove('dragover');
+            handleFiles(e.dataTransfer.files);
+        });
+        fileInput.addEventListener('change', (e) => {
+            handleFiles(e.target.files);
+        });
+        async function handleFiles(files) {
+            const statusDiv = document.getElementById('uploadStatus');
+            for (const file of files) {
+                if (!file.name.endsWith('.pdf')) {
+                    showStatus(statusDiv, `Skipping ${file.name} - only PDFs supported`, 'error');
+                    continue;
+                }
+                showStatus(statusDiv, `Uploading ${file.name}...`, 'loading');
+                const formData = new FormData();
+                formData.append('file', file);
+                try {
+                    const response = await fetch(`${API_URL}/ingest`, {
+                        method: 'POST',
+                        body: formData
+                    });
+                    if (response.ok) {
+                        const data = await response.json();
+                        showStatus(
+                            statusDiv,
+                            `✓ ${file.name}: ${data.chunks_embedded} chunks ingested`,
+                            'success'
+                        );
+                        loadStats();
+                    } else {
+                        const error = await response.json();
+                        showStatus(statusDiv, `✗ ${file.name}: ${error.detail}`, 'error');
+                    }
+                } catch (error) {
+                    showStatus(statusDiv, `✗ Upload failed: ${error.message}`, 'error');
+                }
+            }
+            fileInput.value = '';
+        }
+        async function submitQuery() {
+            const query = document.getElementById('queryInput').value.trim();
+            if (!query) {
+                showStatus(
+                    document.getElementById('queryStatus'),
+                    'Please enter a question',
+                    'error'
+                );
+                return;
+            }
+            const statusDiv = document.getElementById('queryStatus');
+            showStatus(statusDiv, 'Searching your documents...', 'loading');
+            try {
+                const response = await fetch(`${API_URL}/query`, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ query, top_k: 3 })
+                });
+                if (response.ok) {
+                    const data = await response.json();
+                    displayAnswer(data);
+                    statusDiv.classList.add('hidden');
+                } else {
+                    const error = await response.json();
+                    showStatus(statusDiv, error.error || 'Query failed', 'error');
+                }
+            } catch (error) {
+                showStatus(statusDiv, `Error: ${error.message}`, 'error');
+            }
+        }
+        function displayAnswer(data) {
+            document.getElementById('answerText').textContent = data.answer;
+            const sourcesList = document.getElementById('sourcesList');
+            sourcesList.innerHTML = data.sources.map(source => `
+                <div class="source-item">
+                    <div class="relevance">📌 Relevance: ${(source.similarity * 100).toFixed(0)}%</div>
+                    <div class="text">${source.preview}</div>
+                </div>
+            `).join('');
+            document.getElementById('answerContainer').classList.remove('hidden');
+        }
+        async function loadStats() {
+            try {
+                const response = await fetch(`${API_URL}/stats`);
+                if (response.ok) {
+                    const data = await response.json();
+                    document.getElementById('totalChunks').textContent = data.total_chunks;
+                }
+            } catch (error) {
+                console.error('Failed to load stats:', error);
+            }
+        }
+        async function loadHealth() {
+            try {
+                const response = await fetch(`${API_URL}/health`);
+                if (response.ok) {
+                    const data = await response.json();
+                    // Get embedding backend name
+                    let embeddingName = data.embedding_backend || 'Unknown';
+                    // Format nicely
+                    if (embeddingName === 'sentence-transformers') {
+                        embeddingName = 'Sentence-Transformers';
+                    } else if (embeddingName === 'ollama') {
+                        embeddingName = 'Ollama';
+                    }
+                    const healthHtml = `
+                        <div class="stat-box">
+                            <div class="number">${data.embedding_backend ? '✓' : '✗'}</div>
+                            <div class="label">${embeddingName} (Embeddings)</div>
+                        </div>
+                        <div class="stat-box">
+                            <div class="number">${data.groq === '✓' ? '✓' : '✗'}</div>
+                            <div class="label">Groq (LLM)</div>
+                        </div>
+                        <div class="stat-box">
+                            <div class="number">${data.chroma.status === '✓' ? '✓' : '✗'}</div>
+                            <div class="label">Chroma (Vector DB)</div>
+                        </div>
+                        <div class="stat-box">
+                            <div class="number">${data.status === 'healthy' ? '✓' : '⚠'}</div>
+                            <div class="label">Overall Status</div>
+                        </div>
+                    `;
+                    document.getElementById('healthStatus').innerHTML = healthHtml;
+                }
+            } catch (error) {
+                document.getElementById('healthStatus').innerHTML =
+                    `<div style="grid-column: 1/-1; padding: 15px; background: #f8d7da; color: #721c24; border-radius: 8px;">Cannot connect to API at ${API_URL}</div>`;
+            }
+        }
+        async function resetSystem() {
+            if (!confirm('⚠️ Delete ALL documents and embeddings? This cannot be undone!')) {
+                return;
+            }
+            const statusDiv = document.getElementById('uploadStatus');
+            showStatus(statusDiv, 'Resetting system...', 'loading');
+            try {
+                const response = await fetch(`${API_URL}/reset`, {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json'
+                    }
+                });
+                if (response.ok) {
+                    const data = await response.json();
+                    showStatus(statusDiv, '✓ All documents deleted!', 'success');
+                    loadStats();
+                } else {
+                    const error = await response.json();
+                    showStatus(statusDiv, `Reset failed: ${error.detail || 'Unknown error'}`, 'error');
+                }
+            } catch (error) {
+                showStatus(statusDiv, `Error: ${error.message}`, 'error');
+            }
+        }
+        function showStatus(element, message, type) {
+            element.textContent = message;
+            element.className = `status ${type}`;
+            element.classList.remove('hidden');
+        }
+        // Load stats and health on page load
+        window.addEventListener('load', () => {
+            loadStats();
+            loadHealth();
+            setInterval(loadHealth, 30000); // Refresh every 30s
+        });
+    </script>
+</body>
+</html>

frontend/ragas.html ADDED Viewed

	@@ -0,0 +1,671 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>RAGAS Evaluation - Document Intelligence RAG</title>
+    <style>
+        /* Theme toggle */
+        .theme-toggle {
+            position: fixed;
+            top: 20px;
+            right: 20px;
+            padding: 8px 14px;
+            background: var(--surface);
+            border: 1px solid var(--border-soft);
+            border-radius: var(--radius-md);
+            cursor: pointer;
+            font-size: 0.85rem;
+            color: var(--text-main);
+            z-index: 100;
+            transition: background 0.15s ease;
+        }
+        .theme-toggle:hover {
+            background: var(--surface-subtle);
+        }
+        /* Button group */
+        .button-group {
+            display: flex;
+            gap: 10px;
+            margin-bottom: 16px;
+        }
+        .btn-small {
+            padding: 8px 14px;
+            background: var(--accent);
+            color: white;
+            border: none;
+            border-radius: var(--radius-sm);
+            font-size: 0.85rem;
+            cursor: pointer;
+            transition: background 0.15s ease;
+        }
+        .btn-small:hover {
+            background: #1d4ed8;
+        }
+        .btn-small.secondary {
+            background: var(--surface-subtle);
+            color: var(--text-main);
+            border: 1px solid var(--border-soft);
+        }
+        .btn-small.secondary:hover {
+            background: var(--border-soft);
+        }
+        /* Same design tokens as index.html */
+        :root {
+            --surface: #ffffff;
+            --surface-subtle: #fafafa;
+            --bg-main: #f5f7fb;
+            --card-bg: #ffffff;
+            --accent: #2563eb;
+            --accent-soft: #eff6ff;
+            --text-main: #111827;
+            --text-muted: #6b7280;
+            --border-soft: #e5e7eb;
+            --success: #16a34a;
+            --error: #dc2626;
+            --warning: #f59e0b;
+            --radius-sm: 6px;
+            --radius-md: 10px;
+            --radius-lg: 14px;
+        }
+        [data-theme="dark"] {
+            --surface: #1f2937;
+            --surface-subtle: #111827;
+            --bg-main: #0f172a;
+            --card-bg: #1e293b;
+            --accent: #60a5fa;
+            --accent-soft: #1e3a5f;
+            --text-main: #f1f5f9;
+            --text-muted: #94a3b8;
+            --border-soft: #334155;
+        }
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+        body {
+            font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+            background: var(--bg-main);
+            min-height: 100vh;
+            padding: 24px;
+            color: var(--text-main);
+        }
+        .container {
+            max-width: 900px;
+            margin: 0 auto;
+        }
+        header {
+            text-align: center;
+            margin-bottom: 36px;
+        }
+        header h1 {
+            font-size: 2rem;
+            font-weight: 600;
+            letter-spacing: -0.02em;
+            margin-bottom: 8px;
+        }
+        header p {
+            font-size: 1rem;
+            color: var(--text-muted);
+            margin-bottom: 16px;
+        }
+        .nav-links {
+            display: flex;
+            justify-content: center;
+            gap: 12px;
+        }
+        .nav-links a {
+            color: var(--accent);
+            text-decoration: none;
+            font-size: 0.9rem;
+        }
+        .nav-links a:hover {
+            text-decoration: underline;
+        }
+        .card {
+            background: var(--surface);
+            border-radius: var(--radius-lg);
+            padding: 28px;
+            border: 1px solid var(--border-soft);
+            box-shadow: 0 8px 20px rgba(0, 0, 0, 0.05);
+            margin-bottom: 24px;
+        }
+        .card h2 {
+            font-size: 1.2rem;
+            font-weight: 600;
+            margin-bottom: 18px;
+        }
+        .query-section {
+            display: flex;
+            gap: 12px;
+            margin-bottom: 16px;
+        }
+        .query-section input {
+            flex: 1;
+            padding: 12px 14px;
+            border: 1.5px solid var(--border-soft);
+            border-radius: var(--radius-md);
+            font-size: 0.95rem;
+            background: var(--surface);
+            color: var(--text-main);
+        }
+        .query-section input:focus {
+            outline: none;
+            border-color: var(--accent);
+            box-shadow: 0 0 0 3px var(--accent-soft);
+        }
+        .btn {
+            background: var(--accent);
+            color: white;
+            border: none;
+            padding: 12px 24px;
+            border-radius: var(--radius-md);
+            font-size: 0.95rem;
+            font-weight: 500;
+            cursor: pointer;
+            transition: background 0.15s ease, transform 0.15s ease;
+        }
+        .btn:hover {
+            background: #1d4ed8;
+            transform: translateY(-1px);
+        }
+        .btn:disabled {
+            background: var(--text-muted);
+            cursor: not-allowed;
+            transform: none;
+        }
+        .status {
+            padding: 12px;
+            border-radius: var(--radius-md);
+            font-size: 0.9rem;
+            margin-bottom: 16px;
+        }
+        .status.loading {
+            background: var(--accent-soft);
+            color: var(--accent);
+        }
+        .status.error {
+            background: #fef2f2;
+            color: var(--error);
+        }
+        .status.success {
+            background: #f0fdf4;
+            color: var(--success);
+        }
+        .hidden {
+            display: none;
+        }
+        /* Answer Section */
+        .answer-box {
+            background: var(--surface-subtle);
+            border-radius: var(--radius-md);
+            padding: 16px;
+            margin-bottom: 20px;
+            border-left: 4px solid var(--accent);
+        }
+        .answer-box p {
+            line-height: 1.6;
+        }
+        /* RAGAS Scores */
+        .scores-grid {
+            display: grid;
+            grid-template-columns: repeat(3, 1fr);
+            gap: 16px;
+            margin-bottom: 20px;
+        }
+        .score-card {
+            background: var(--surface-subtle);
+            border-radius: var(--radius-md);
+            padding: 20px;
+            text-align: center;
+        }
+        .score-label {
+            font-size: 0.85rem;
+            color: var(--text-muted);
+            margin-bottom: 8px;
+            text-transform: uppercase;
+            letter-spacing: 0.5px;
+        }
+        .score-value {
+            font-size: 2rem;
+            font-weight: 700;
+        }
+        .score-value.high {
+            color: var(--success);
+        }
+        .score-value.medium {
+            color: var(--warning);
+        }
+        .score-value.low {
+            color: var(--error);
+        }
+        /* Progress Bar */
+        .progress-bar {
+            height: 8px;
+            background: var(--border-soft);
+            border-radius: 4px;
+            overflow: hidden;
+            margin-top: 8px;
+        }
+        .progress-fill {
+            height: 100%;
+            border-radius: 4px;
+            transition: width 0.5s ease;
+        }
+        .progress-fill.high {
+            background: var(--success);
+        }
+        .progress-fill.medium {
+            background: var(--warning);
+        }
+        .progress-fill.low {
+            background: var(--error);
+        }
+        /* Aggregate Stats */
+        .aggregate-stats {
+            display: flex;
+            justify-content: space-around;
+            padding: 16px;
+            background: var(--surface-subtle);
+            border-radius: var(--radius-md);
+        }
+        .stat-item {
+            text-align: center;
+        }
+        .stat-value {
+            font-size: 1.5rem;
+            font-weight: 600;
+            color: var(--accent);
+        }
+        .stat-label {
+            font-size: 0.8rem;
+            color: var(--text-muted);
+        }
+        /* Sources */
+        .sources-section {
+            margin-top: 20px;
+        }
+        .sources-section h3 {
+            font-size: 1rem;
+            margin-bottom: 12px;
+            color: var(--text-muted);
+        }
+        .source-item {
+            background: var(--surface-subtle);
+            border-radius: var(--radius-sm);
+            padding: 12px;
+            margin-bottom: 8px;
+            font-size: 0.85rem;
+        }
+        .source-similarity {
+            color: var(--accent);
+            font-weight: 500;
+            margin-bottom: 4px;
+        }
+        /* Info Box */
+        .info-box {
+            background: var(--accent-soft);
+            border-left: 4px solid var(--accent);
+            padding: 14px;
+            border-radius: var(--radius-sm);
+            margin-bottom: 20px;
+        }
+        .info-box p {
+            font-size: 0.9rem;
+            color: var(--text-muted);
+        }
+        /* Time display */
+        .time-info {
+            font-size: 0.85rem;
+            color: var(--text-muted);
+            text-align: right;
+            margin-top: 12px;
+        }
+        @media (max-width: 600px) {
+            .scores-grid {
+                grid-template-columns: 1fr;
+            }
+            .query-section {
+                flex-direction: column;
+            }
+        }
+    </style>
+</head>
+<body>
+    <button class="theme-toggle" id="themeToggle">🌙 Dark mode</button>
+    <div class="container">
+        <header>
+            <h1>🔬 RAGAS Evaluation</h1>
+            <p>Test the RAG pipeline and see real-time quality metrics</p>
+            <div class="nav-links">
+                <a href="/">← Back to Main</a>
+                <a href="/evaluation">View Full Dashboard</a>
+            </div>
+        </header>
+        <!-- Query Section -->
+        <div class="card">
+            <h2>Ask a Question</h2>
+            <div class="info-box">
+                <p>Enter a question to query the RAG system. RAGAS will evaluate the response for
+                    <strong>Faithfulness</strong> (is the answer grounded in context?) and <strong>Context
+                        Precision</strong> (were the retrieved chunks useful?).
+                </p>
+            </div>
+            <div class="query-section">
+                <input type="text" id="queryInput" placeholder="e.g., What is machine learning?"
+                    onkeypress="if(event.key === 'Enter') runEvaluation()">
+                <button class="btn" id="evalBtn" onclick="runEvaluation()">
+                    🔍 Query & Evaluate
+                </button>
+            </div>
+            <div id="status" class="status hidden"></div>
+        </div>
+        <!-- Results Section -->
+        <div id="resultsSection" class="card hidden">
+            <h2>📊 RAGAS Evaluation Results</h2>
+            <!-- Scores -->
+            <div class="scores-grid">
+                <div class="score-card">
+                    <div class="score-label">Faithfulness</div>
+                    <div class="score-value" id="faithScore">-</div>
+                    <div class="progress-bar">
+                        <div class="progress-fill" id="faithBar" style="width: 0%"></div>
+                    </div>
+                </div>
+                <div class="score-card">
+                    <div class="score-label">Context Precision</div>
+                    <div class="score-value" id="precisionScore">-</div>
+                    <div class="progress-bar">
+                        <div class="progress-fill" id="precisionBar" style="width: 0%"></div>
+                    </div>
+                </div>
+                <div class="score-card">
+                    <div class="score-label">RAGAS Score</div>
+                    <div class="score-value" id="ragasScore">-</div>
+                    <div class="progress-bar">
+                        <div class="progress-fill" id="ragasBar" style="width: 0%"></div>
+                    </div>
+                </div>
+            </div>
+            <!-- Answer -->
+            <h3 style="margin-bottom: 12px;">📝 Generated Answer</h3>
+            <div class="answer-box">
+                <p id="answerText">-</p>
+            </div>
+            <!-- Sources -->
+            <div class="sources-section">
+                <h3>📚 Retrieved Sources</h3>
+                <div id="sourcesList"></div>
+            </div>
+            <div class="time-info">
+                <span id="responseTime"></span> | <span id="evalTime"></span>
+            </div>
+        </div>
+        <!-- Aggregate Stats -->
+        <div class="card">
+            <h2>📈 Aggregate Statistics</h2>
+            <div class="button-group">
+                <button class="btn-small" onclick="loadAggregateStats()">🔄 Refresh</button>
+                <button class="btn-small secondary" onclick="clearRagasResults()">🗑️ Clear Results</button>
+            </div>
+            <div class="aggregate-stats">
+                <div class="stat-item">
+                    <div class="stat-value" id="totalEvals">0</div>
+                    <div class="stat-label">Total Evaluations</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value" id="avgFaith">-</div>
+                    <div class="stat-label">Avg Faithfulness</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value" id="avgPrecision">-</div>
+                    <div class="stat-label">Avg Precision</div>
+                </div>
+                <div class="stat-item">
+                    <div class="stat-value" id="avgRagas">-</div>
+                    <div class="stat-label">Avg RAGAS Score</div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <script>
+        async function clearRagasResults() {
+            if (!confirm('Clear all RAGAS evaluation results? This cannot be undone.')) {
+                return;
+            }
+            try {
+                const response = await fetch(`${API_URL}/ragas/reset`, {
+                    method: 'POST'
+                });
+                if (response.ok) {
+                    // Reset displayed values
+                    document.getElementById('totalEvals').textContent = '0';
+                    document.getElementById('avgFaith').textContent = '-';
+                    document.getElementById('avgPrecision').textContent = '-';
+                    document.getElementById('avgRagas').textContent = '-';
+                    alert('RAGAS results cleared!');
+                } else {
+                    alert('Failed to clear results');
+                }
+            } catch (error) {
+                alert('Error: ' + error.message);
+            }
+        }
+        // Dark mode toggle
+        const themeToggle = document.getElementById("themeToggle");
+        const root = document.documentElement;
+        const savedTheme = localStorage.getItem("theme");
+        const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
+        if (savedTheme) {
+            root.setAttribute("data-theme", savedTheme);
+        } else if (prefersDark) {
+            root.setAttribute("data-theme", "dark");
+        }
+        function updateToggleText() {
+            const isDark = root.getAttribute("data-theme") === "dark";
+            themeToggle.textContent = isDark ? "☀️ Light mode" : "🌙 Dark mode";
+        }
+        updateToggleText();
+        themeToggle.addEventListener("click", () => {
+            const isDark = root.getAttribute("data-theme") === "dark";
+            const newTheme = isDark ? "light" : "dark";
+            root.setAttribute("data-theme", newTheme);
+            localStorage.setItem("theme", newTheme);
+            updateToggleText();
+        });
+        const API_URL = window.location.origin;
+        async function runEvaluation() {
+            const query = document.getElementById('queryInput').value.trim();
+            if (!query) {
+                showStatus('Please enter a question', 'error');
+                return;
+            }
+            const btn = document.getElementById('evalBtn');
+            btn.disabled = true;
+            btn.textContent = '⏳ Evaluating...';
+            showStatus('Querying RAG and running RAGAS evaluation...', 'loading');
+            document.getElementById('resultsSection').classList.add('hidden');
+            try {
+                const response = await fetch(`${API_URL}/ragas/query-and-evaluate`, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ query, top_k: 3 })
+                });
+                if (!response.ok) {
+                    throw new Error('Evaluation failed');
+                }
+                const data = await response.json();
+                displayResults(data);
+                hideStatus();
+                loadAggregateStats();
+            } catch (error) {
+                showStatus(`Error: ${error.message}`, 'error');
+            } finally {
+                btn.disabled = false;
+                btn.textContent = '🔍 Query & Evaluate';
+            }
+        }
+        function displayResults(data) {
+            document.getElementById('resultsSection').classList.remove('hidden');
+            // Answer
+            document.getElementById('answerText').textContent = data.answer;
+            // RAGAS Scores
+            if (data.ragas) {
+                setScore('faithScore', 'faithBar', data.ragas.faithfulness);
+                setScore('precisionScore', 'precisionBar', data.ragas.context_precision);
+                setScore('ragasScore', 'ragasBar', data.ragas.ragas_score);
+                document.getElementById('evalTime').textContent =
+                    `Eval: ${data.ragas.eval_time_ms.toFixed(0)}ms`;
+            }
+            document.getElementById('responseTime').textContent =
+                `Response: ${data.response_time_ms.toFixed(0)}ms`;
+            // Sources
+            const sourcesList = document.getElementById('sourcesList');
+            sourcesList.innerHTML = data.sources.map(source => `
+                <div class="source-item">
+                    <div class="source-similarity">📌 Similarity: ${(source.similarity * 100).toFixed(0)}%</div>
+                    <div>${source.preview}</div>
+                </div>
+            `).join('');
+        }
+        function setScore(valueId, barId, score) {
+            const valueEl = document.getElementById(valueId);
+            const barEl = document.getElementById(barId);
+            const percent = (score * 100).toFixed(0);
+            valueEl.textContent = percent + '%';
+            barEl.style.width = percent + '%';
+            // Color coding
+            let colorClass = 'high';
+            if (score < 0.7) colorClass = 'medium';
+            if (score < 0.5) colorClass = 'low';
+            valueEl.className = 'score-value ' + colorClass;
+            barEl.className = 'progress-fill ' + colorClass;
+        }
+        async function loadAggregateStats() {
+            try {
+                const response = await fetch(`${API_URL}/ragas/metrics`);
+                const data = await response.json();
+                document.getElementById('totalEvals').textContent = data.total_evaluations || 0;
+                document.getElementById('avgFaith').textContent =
+                    data.avg_faithfulness ? (data.avg_faithfulness * 100).toFixed(0) + '%' : '-';
+                document.getElementById('avgPrecision').textContent =
+                    data.avg_context_precision ? (data.avg_context_precision * 100).toFixed(0) + '%' : '-';
+                document.getElementById('avgRagas').textContent =
+                    data.avg_ragas_score ? (data.avg_ragas_score * 100).toFixed(0) + '%' : '-';
+            } catch (e) {
+                console.error('Failed to load aggregate stats:', e);
+            }
+        }
+        function showStatus(message, type) {
+            const status = document.getElementById('status');
+            status.textContent = message;
+            status.className = `status ${type}`;
+            status.classList.remove('hidden');
+        }
+        function hideStatus() {
+            document.getElementById('status').classList.add('hidden');
+        }
+        // Load stats on page load
+        window.addEventListener('load', loadAggregateStats);
+    </script>
+</body>
+</html>

pyproject.toml CHANGED Viewed

@@ -16,7 +16,10 @@ dependencies = [
     "python-dotenv>=1.2.1",
     "python-multipart>=0.0.20",
     "requests>=2.32.5",
-    "sentence-transformers>=5.2.0",
     "uvicorn[standard]>=0.38.0",
-    "scikit-learn>=1.3.0"
 ]

     "python-dotenv>=1.2.1",
     "python-multipart>=0.0.20",
     "requests>=2.32.5",
+    "sentence-transformers>=5.2.0",
     "uvicorn[standard]>=0.38.0",
+    "scikit-learn>=1.3.0",
+    "ragas>=0.4.2",
+    "langchain-groq>=1.1.1",
+    "langchain-openai>=1.1.6",
 ]

run_real_evaluation.py ADDED Viewed

	@@ -0,0 +1,473 @@

+"""
+Real RAG Evaluation Script
+Runs actual queries through my RAG and computes real metrics.
+"""
+import json
+import tempfile
+import os
+import time
+from pathlib import Path
+from typing import List, Dict, Any
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from src.rag.pipeline import RAGPipeline
+from src.evaluation import RAGEvaluator, EvaluationResult
+# ==================== TEST DATASET ====================
+# These are synthetic queries + documents, but metrics are REAL
+# based on actual retrieval and generation from my RAG
+TEST_DOCUMENTS = {
+    "medical_research_1.txt": """
+    Drug X Clinical Trial Results
+    A Phase III clinical trial was conducted to evaluate the efficacy and safety of Drug X
+    in patients with condition Y. The study enrolled 500 patients aged 18-65 with confirmed
+    diagnosis of condition Y.
+    Primary Efficacy Endpoint:
+    Drug X demonstrated a 65% response rate compared to 35% in the placebo group (p<0.001).
+    The median time to response was 4 weeks.
+    Safety Profile:
+    The most common adverse events were:
+    - Headache (12% of patients)
+    - Mild gastrointestinal upset (8% of patients)
+    - Dizziness (5% of patients)
+    - Fatigue (4% of patients)
+    Serious adverse events occurred in 2% of patients, including liver enzyme elevation.
+    No deaths were attributed to the drug during the trial period.
+    Dosage Recommendations:
+    The recommended dose is 500mg twice daily with meals. Dose adjustments may be necessary
+    for patients with renal impairment (dose reduction to 250mg twice daily recommended).
+    Mechanism of Action:
+    Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells.
+    This inhibition leads to cell cycle arrest and apoptosis of affected cells.
+    """,
+    "drug_interactions.txt": """
+    Drug X Drug Interaction Guide
+    Important Drug Interactions:
+    1. CYP3A4 Inhibitors (e.g., ketoconazole, ritonavir):
+       - May increase Drug X levels by 3-5 fold
+       - Monitor for adverse effects
+       - Consider dose reduction
+    2. Warfarin:
+       - Potential increased bleeding risk
+       - Monitor INR closely
+       - Baseline INR and weekly monitoring recommended
+    3. Oral Contraceptives:
+       - May reduce contraceptive efficacy
+       - Alternative contraception recommended
+       - No dose adjustment needed for Drug X
+    4. NSAIDs:
+       - Increased risk of GI bleeding
+       - Monitor for GI symptoms
+       - Consider gastroprotection
+    5. ACE Inhibitors:
+       - No significant interaction
+       - Safe to use concomitantly
+       - No monitoring required
+    """,
+    "patient_case_study.txt": """
+    Case Study: 45-year-old Female with Condition Y
+    Patient History:
+    A 45-year-old female presented with a 6-month history of progressive symptoms consistent
+    with condition Y. She has a past medical history of hypertension controlled on lisinopril
+    and type 2 diabetes on metformin.
+    Treatment Response:
+    Patient was started on Drug X 500mg twice daily. After 2 weeks of treatment, she reported
+    partial symptom improvement. By week 6, she achieved complete response with 95% symptom
+    resolution.
+    Side Effects Experienced:
+    - Mild headache (treated with acetaminophen)
+    - Occasional nausea (resolved with food intake)
+    - No serious adverse events
+    Follow-up:
+    Patient continues to do well on Drug X at 6-month follow-up with sustained response.
+    No dose adjustments were necessary. Lab values remain within normal limits.
+    """
+}
+TEST_CASES = [
+    {
+        "query": "What is the response rate of Drug X?",
+        "expected_answer_keywords": ["65%", "response rate"],
+        "expected_source_docs": ["medical_research_1"],
+        "description": "Should retrieve clinical trial data"
+    },
+    {
+        "query": "What are the side effects of Drug X?",
+        "expected_answer_keywords": ["headache", "gastrointestinal", "dizziness"],
+        "expected_source_docs": ["medical_research_1"],
+        "description": "Should retrieve safety profile section"
+    },
+    {
+        "query": "How does Drug X interact with warfarin?",
+        "expected_answer_keywords": ["warfarin", "bleeding", "INR"],
+        "expected_source_docs": ["drug_interactions"],
+        "description": "Should retrieve drug interactions guide"
+    },
+    {
+        "query": "What is the recommended dosage of Drug X?",
+        "expected_answer_keywords": ["500mg", "twice daily"],
+        "expected_source_docs": ["medical_research_1"],
+        "description": "Should retrieve dosage recommendations"
+    },
+    {
+        "query": "What is the mechanism of action for Drug X?",
+        "expected_answer_keywords": ["protein kinase", "inhibiting", "apoptosis"],
+        "expected_source_docs": ["medical_research_1"],
+        "description": "Should retrieve mechanism section"
+    },
+]
+# ==================== METRIC COMPUTATION ====================
+def compute_retrieval_precision(
+    retrieved_docs: List[str],
+    expected_docs: List[str]
+) -> float:
+    """
+    Fraction of retrieved entries that are expected (relevant) documents.
+    Precision = TP / (TP + FP). Every entry of retrieved_docs counts, so a
+    document that appears several times is counted several times.
+    Returns 0.0 when nothing was retrieved.
+    """
+    if not retrieved_docs:
+        return 0.0
+    hits = [name for name in retrieved_docs if name in expected_docs]
+    return float(len(hits) / len(retrieved_docs))
+def compute_retrieval_recall(
+    retrieved_docs: List[str],
+    expected_docs: List[str]
+) -> float:
+    """
+    Fraction of expected (relevant) documents that were retrieved.
+    Recall = TP / (TP + FN).
+    Returns 1.0 when no documents are expected, since nothing could be missed.
+    """
+    if not expected_docs:
+        return 1.0
+    found = [name for name in expected_docs if name in retrieved_docs]
+    return float(len(found) / len(expected_docs))
+def compute_bert_score(generated_answer: str, expected_keywords: List[str]) -> float:
+    """
+    Keyword-coverage stand-in for BERTScore.
+    Counts the expected keywords that occur as case-insensitive substrings of
+    the answer and returns that share, capped at 0.95. No embeddings are used.
+    Returns 1.0 when no keywords are expected.
+    """
+    if not expected_keywords:
+        return 1.0
+    haystack = generated_answer.lower()
+    matched = sum(1 for keyword in expected_keywords if keyword.lower() in haystack)
+    coverage = matched / len(expected_keywords)
+    # Full keyword coverage is deliberately capped just below a perfect score.
+    return float(min(coverage, 0.95))
+def compute_answer_relevance(generated_answer: str, query: str) -> float:
+    """
+    Heuristic relevance score for an answer.
+    Returns 0.3 if the answer contains a refusal phrase, 0.4 if it has fewer
+    than three whitespace-separated words, and 0.85 otherwise. The query
+    argument is accepted but not consulted by this approximation.
+    """
+    normalized = generated_answer.lower()
+    refusal_markers = (
+        "i don't know",
+        "i cannot",
+        "i'm unable",
+        "not found",
+        "no information",
+        "unable to find",
+    )
+    for marker in refusal_markers:
+        if marker in normalized:
+            return 0.3
+    # Very short answers are treated as trivial.
+    word_count = len(generated_answer.split())
+    return 0.4 if word_count < 3 else 0.85
+def detect_hallucinations(
+    generated_answer: str,
+    retrieved_context: str
+) -> bool:
+    """
+    Loosely flag answers that share little vocabulary with the retrieved context.
+    The answer and context are lower-cased and split on whitespace; the answer
+    is flagged when fewer than 30% of its distinct words occur in the context.
+    This is a word-overlap approximation, not an NLI-based detector.
+    Returns False for an empty answer: with no words there is nothing that
+    could have been made up.
+    """
+    answer_words = set(generated_answer.lower().split())
+    if not answer_words:
+        return False
+    context_words = set(retrieved_context.lower().split())
+    overlap = len(answer_words & context_words) / len(answer_words)
+    # Conservative threshold: only very low overlap counts as a hallucination.
+    return overlap < 0.3
+def compute_faithfulness(
+    generated_answer: str,
+    retrieved_context: str
+) -> float:
+    """
+    Word-overlap estimate of how grounded the answer is in the context.
+    The share of distinct answer words that also occur in the context is
+    doubled and clipped to 1.0 (e.g. 0.3 overlap -> 0.6 faithfulness).
+    Returns 0.0 for an empty answer.
+    """
+    answer_vocab = set(generated_answer.lower().split())
+    context_vocab = set(retrieved_context.lower().split())
+    if not answer_vocab:
+        return 0.0
+    shared = answer_vocab.intersection(context_vocab)
+    ratio = len(shared) / len(answer_vocab)
+    return float(min(ratio * 2, 1.0))
+def compute_source_attribution(
+    generated_answer: str,
+    retrieved_context: str
+) -> float:
+    """
+    Share of answer words (repeats included) that occur in the retrieved context.
+    Both texts are lower-cased and split on whitespace. Returns 0.0 for an
+    empty answer.
+    """
+    tokens = generated_answer.lower().split()
+    vocabulary = set(retrieved_context.lower().split())
+    if not tokens:
+        return 0.0
+    backed = [token for token in tokens if token in vocabulary]
+    return float(len(backed) / len(tokens))
+# ==================== MAIN EVALUATION LOOP ====================
+def run_real_evaluation():
+    """
+    Run the bundled test cases through the RAG pipeline and record heuristic metrics.
+    Steps: write TEST_DOCUMENTS into a temporary directory, ingest them into a
+    fresh RAGPipeline, run every entry of TEST_CASES, score each answer with the
+    compute_* helpers of this module, hand the results to RAGEvaluator, and
+    print an aggregate summary.
+    Returns None. A failure while initializing the pipeline or ingesting the
+    documents is printed and aborts the run; a failure inside a single test
+    case is printed and the loop moves on to the next case.
+    """
+    print("=" * 70)
+    print("REAL RAG EVALUATION")
+    print("=" * 70)
+    # Initialize evaluator
+    evaluator = RAGEvaluator(store_results=True, results_dir="evaluation_results")
+    # Create temporary directory for test documents
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(f"\nCreating test documents in {tmpdir}")
+        # Write test documents
+        for doc_name, content in TEST_DOCUMENTS.items():
+            doc_path = Path(tmpdir) / doc_name
+            doc_path.write_text(content)
+            print(f"   ✓ {doc_name}")
+        # Initialize RAG pipeline
+        print(f"\n🚀 Initializing RAG pipeline...")
+        try:
+            pipeline = RAGPipeline()
+        except Exception as e:
+            print(f"   ✗ Failed to initialize pipeline: {e}")
+            return
+        # Ingest documents (the in-memory text is ingested, keyed by the file stem)
+        print(f"\nIngesting documents into RAG...")
+        try:
+            for doc_name, content in TEST_DOCUMENTS.items():
+                doc_id = Path(doc_name).stem  # Remove extension
+                pipeline.ingest(doc_id, content)
+                print(f"   ✓ Ingested {doc_name} (doc_id: {doc_id})")
+        except Exception as e:
+            print(f"   ✗ Failed to ingest documents: {e}")
+            return
+        # Run test cases
+        print(f"\nRunning {len(TEST_CASES)} test cases...\n")
+        for i, test_case in enumerate(TEST_CASES, 1):
+            query = test_case["query"]
+            expected_keywords = test_case["expected_answer_keywords"]
+            expected_docs = test_case["expected_source_docs"]
+            print(f"Test {i}: {query}")
+            print(f"   Expected sources: {expected_docs}")
+            try:
+                start_time = time.time()
+                # Query the RAG
+                result = pipeline.query(query, return_sources=True)
+                latency_ms = (time.time() - start_time) * 1000
+                answer = result.get('answer', '')
+                retrieved_docs = result.get('sources', [])
+                context = result.get('context', '')
+                # If context is empty, reconstruct from retrieved sources
+                if not context and retrieved_docs:
+                    # Combine previews from all retrieved sources
+                    context = ' '.join([source.get('preview', '') for source in retrieved_docs if isinstance(source, dict)])
+                print(f"   DEBUG - context length: {len(context)}")
+                # Extract doc names from sources
+                # Sources are dicts with 'chunk_id' like 'medical_research_1_chunk_0'
+                retrieved_doc_names = []
+                for source in retrieved_docs:
+                    if isinstance(source, dict):
+                        chunk_id = source.get('chunk_id', '')
+                        # Extract doc_id from chunk_id (format: {doc_id}_chunk_{num})
+                        doc_id = chunk_id.rsplit('_chunk_', 1)[0] if '_chunk_' in chunk_id else 'unknown'
+                        retrieved_doc_names.append(doc_id)
+                print(f"   Retrieved: {retrieved_doc_names}")
+                print(f"   Answer: {answer[:100]}...")
+                # COMPUTE REAL METRICS
+                retrieval_precision = compute_retrieval_precision(
+                    retrieved_doc_names,
+                    expected_docs
+                )
+                retrieval_recall = compute_retrieval_recall(
+                    retrieved_doc_names,
+                    expected_docs
+                )
+                bert_score = compute_bert_score(answer, expected_keywords)
+                answer_relevance = compute_answer_relevance(answer, query)
+                faithfulness = compute_faithfulness(answer, context)
+                hallucination_detected = detect_hallucinations(answer, context)
+                source_attribution = compute_source_attribution(answer, context)
+                # 1-based rank of the first retrieved chunk that belongs to any expected
+                # document. A miss ranks one past the retrieved list and never better
+                # than 2, so it always scores worse than a hit.
+                rank_position = next(
+                    (
+                        position
+                        for position, name in enumerate(retrieved_doc_names, start=1)
+                        if name in expected_docs
+                    ),
+                    max(len(retrieved_doc_names), 1) + 1,
+                )
+                print(f"   Precision: {retrieval_precision:.2f} | Recall: {retrieval_recall:.2f}")
+                print(f"   BERTScore: {bert_score:.2f} | Relevance: {answer_relevance:.2f}")
+                print(f"   Faithfulness: {faithfulness:.2f} | Attribution: {source_attribution:.2f}")
+                print(f"   Hallucination: {hallucination_detected} | Latency: {latency_ms:.0f}ms")
+                # Create evaluation result
+                eval_result = EvaluationResult(
+                    query=query,
+                    answer=answer,
+                    source_docs=retrieved_doc_names,
+                    num_retrieved=len(retrieved_docs),
+                    retrieval_precision=retrieval_precision,
+                    retrieval_recall=retrieval_recall,
+                    rank_position=rank_position,
+                    rouge_l=bert_score,  # Approximation
+                    bert_score=bert_score,
+                    answer_relevance=answer_relevance,
+                    faithfulness=faithfulness,
+                    hallucination_detected=hallucination_detected,
+                    source_attribution_score=source_attribution,
+                    latency_ms=latency_ms,
+                    tokens_used=len(answer.split()),  # Word count used as a token estimate
+                    cost_cents=0.004,  # Estimate for Groq
+                )
+                evaluator.add_result(eval_result)
+                print()
+            except Exception as e:
+                # Best-effort loop: report the failing case and continue with the rest.
+                print(f"   ✗ Test failed: {e}\n")
+        # Print summary
+        print("\n" + "=" * 70)
+        print("EVALUATION SUMMARY")
+        print("=" * 70)
+        metrics = evaluator.compute_aggregate_metrics()
+        print(f"\nResults:")
+        print(f"   Total Evaluations: {metrics['total_evaluations']}")
+        print(f"   Avg Precision: {metrics['retrieval_precision_mean']:.3f}")
+        print(f"   Avg Recall: {metrics['retrieval_recall_mean']:.3f}")
+        print(f"   Avg BERTScore: {metrics['bert_score_mean']:.3f}")
+        print(f"   Avg Faithfulness: {metrics['faithfulness_mean']:.3f}")
+        print(f"   Hallucination Rate: {metrics['hallucination_rate']*100:.1f}%")
+        print(f"   Avg Latency: {metrics['latency_mean']:.0f}ms")
+        print(f"   MRR: {metrics['mrr']:.3f}")
+        print(f"\nResults saved to: evaluation_results/results.jsonl")
+        print(f"View dashboard at: http://localhost:8000/evaluation")
+if __name__ == "__main__":
+    run_real_evaluation()

src/evaluation/ragas_integration/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""RAGAS Integration for Document Intelligence RAG"""
+from .pipeline_adapter import RagasReadyPipeline, EvaluationReadyResponse
+from .ragas_evaluator import RagasEvaluator, RagasEvaluationResult
+from .ragas_endpoints import init_ragas_router
+__all__ = [
+    "RagasReadyPipeline",
+    "EvaluationReadyResponse",
+    "RagasEvaluator",
+    "RagasEvaluationResult",
+    "init_ragas_router",
+]

src/evaluation/ragas_integration/pipeline_adapter.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+Pipeline Adapter - Captures full context for RAGAS evaluation
+"""
+import time
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+from src.rag.llm import build_context_string
+@dataclass
+class EvaluationReadyResponse:
+    """
+    Response with full contexts for RAGAS.
+    Produced by RagasReadyPipeline.query_for_evaluation; besides the answer it
+    carries the complete text of every retrieved chunk so the answer can be
+    scored against exactly what the LLM was given.
+    """
+    # The user query that was asked
+    query: str
+    # LLM answer, or a fixed explanatory message when nothing could be retrieved
+    answer: str
+    contexts: List[str]  # Full text of each retrieved chunk
+    sources: List[Dict]  # Original source metadata
+    # Number of chunks passed to the LLM (0 on the early-return paths)
+    chunks_used: int
+    # Elapsed time of the query in milliseconds
+    response_time_ms: float
+    # "success", "no_documents" or "no_results"
+    status: str
+class RagasReadyPipeline:
+    """
+    Wraps an existing RAGPipeline so that a query also returns the full text
+    of every retrieved chunk, which RAGAS needs as its contexts.
+    """
+    def __init__(self, base_pipeline):
+        """
+        Args:
+            base_pipeline: Existing RAGPipeline instance. Its config, embeddings,
+                llm and vector_store attributes are reused as-is.
+        """
+        self.pipeline = base_pipeline
+        self.config = base_pipeline.config
+        self.embeddings = base_pipeline.embeddings
+        self.llm = base_pipeline.llm
+        self.vector_store = base_pipeline.vector_store
+    def query_for_evaluation(
+        self,
+        query: str,
+        top_k: Optional[int] = None
+    ) -> EvaluationReadyResponse:
+        """
+        Query the pipeline and capture the FULL retrieved context for RAGAS.
+        The query is embedded, the top chunks are retrieved from the vector
+        store, their text is passed to the LLM as context, and the answer is
+        returned together with the untruncated chunk texts.
+        Args:
+            query: The user question.
+            top_k: Number of chunks to retrieve; a falsy value (None or 0)
+                falls back to config.top_k.
+        Returns:
+            EvaluationReadyResponse with status "no_documents" when the vector
+            store is empty, "no_results" when retrieval returns nothing, and
+            "success" otherwise. response_time_ms is measured on every path.
+        """
+        start_time = time.time()
+        if self.vector_store.size() == 0:
+            return EvaluationReadyResponse(
+                query=query,
+                answer="No documents in vector store",
+                contexts=[],
+                sources=[],
+                chunks_used=0,
+                response_time_ms=(time.time() - start_time) * 1000,
+                status="no_documents"
+            )
+        query_embedding = self.embeddings.embed(query)
+        k = top_k or self.config.top_k
+        retrieved_chunks = self.vector_store.retrieve(query_embedding, top_k=k)
+        if not retrieved_chunks:
+            return EvaluationReadyResponse(
+                query=query,
+                answer="No relevant documents found.",
+                contexts=[],
+                sources=[],
+                chunks_used=0,
+                response_time_ms=(time.time() - start_time) * 1000,
+                status="no_results"
+            )
+        # Keep the untruncated chunk texts: RAGAS scores against these.
+        contexts = [chunk.text for chunk in retrieved_chunks]
+        context_string = build_context_string(retrieved_chunks)
+        answer = self.llm.query(context=context_string, query=query)
+        # Source metadata only carries a short preview (first 100 characters).
+        sources = [
+            {
+                "chunk_id": chunk.chunk_id,
+                "similarity": round(chunk.similarity, 3),
+                "preview": (chunk.text[:100] + "...") if len(chunk.text) > 100 else chunk.text
+            }
+            for chunk in retrieved_chunks
+        ]
+        response_time_ms = (time.time() - start_time) * 1000
+        return EvaluationReadyResponse(
+            query=query,
+            answer=answer,
+            contexts=contexts,  # Full texts for RAGAS!
+            sources=sources,
+            chunks_used=len(contexts),
+            response_time_ms=response_time_ms,
+            status="success"
+        )

src/evaluation/ragas_integration/ragas_endpoints.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""
+RAGAS API Endpoints
+"""
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from typing import List, Optional
+router = APIRouter()
+# We'll set these from main.py
+ragas_pipeline = None
+ragas_evaluator = None
+class RagasEvalRequest(BaseModel):
+    """
+    Direct evaluation request.
+    Body of POST /evaluate: the caller supplies an already generated answer
+    together with the contexts it was based on.
+    """
+    # The question that was asked
+    query: str
+    # The answer to be scored
+    answer: str
+    # Context passages the answer is evaluated against
+    contexts: List[str]
+class RagasQueryRequest(BaseModel):
+    """
+    Query + evaluate request.
+    Body of POST /query-and-evaluate: the question is answered by the RAG
+    pipeline first and the response is then scored.
+    """
+    # The question to send through the pipeline
+    query: str
+    # Number of chunks to retrieve; passed to query_for_evaluation as top_k
+    top_k: int = 3
+def init_ragas_router(pipeline, evaluator):
+    """
+    Bind the module-level pipeline and evaluator used by the endpoints.
+    Args:
+        pipeline: Object providing query_for_evaluation().
+        evaluator: Object providing evaluate_single() and a results list.
+    Returns:
+        The module's APIRouter, ready to be included in the application.
+    """
+    global ragas_pipeline, ragas_evaluator
+    ragas_pipeline, ragas_evaluator = pipeline, evaluator
+    return router
+@router.post("/evaluate")
+async def evaluate_direct(request: RagasEvalRequest):
+    """
+    Score a caller-supplied query/answer/contexts triple.
+    Intended for cases where the answer and its contexts already exist, so
+    the RAG pipeline is not invoked.
+    Raises:
+        HTTPException: 503 when the evaluator has not been initialized.
+    """
+    if not ragas_evaluator:
+        raise HTTPException(status_code=503, detail="RAGAS not initialized")
+    outcome = await ragas_evaluator.evaluate_single(
+        query=request.query,
+        answer=request.answer,
+        contexts=request.contexts,
+    )
+    payload = {
+        "eval_id": outcome.eval_id,
+        "faithfulness": outcome.faithfulness,
+        "context_precision": outcome.context_precision,
+        "ragas_score": outcome.ragas_score,
+        "latency_ms": round(outcome.latency_ms, 2),
+    }
+    return payload
+@router.post("/query-and-evaluate")
+async def query_and_evaluate(request: RagasQueryRequest):
+    """
+    Answer a query with the RAG pipeline, then score the response with RAGAS.
+    Returns the answer, source metadata and timing together with the metrics.
+    When the pipeline does not report status "success", the response is
+    returned unscored with "ragas" set to None.
+    Raises:
+        HTTPException: 503 when the pipeline or evaluator is not initialized.
+    """
+    if not (ragas_pipeline and ragas_evaluator):
+        raise HTTPException(status_code=503, detail="RAGAS not initialized")
+    # Step 1: run retrieval + generation
+    rag_reply = ragas_pipeline.query_for_evaluation(
+        query=request.query,
+        top_k=request.top_k,
+    )
+    if rag_reply.status != "success":
+        return {
+            "query": rag_reply.query,
+            "answer": rag_reply.answer,
+            "status": rag_reply.status,
+            "ragas": None,
+        }
+    # Step 2: score the answer against the full retrieved contexts
+    scored = await ragas_evaluator.evaluate_single(
+        query=rag_reply.query,
+        answer=rag_reply.answer,
+        contexts=rag_reply.contexts,
+    )
+    ragas_section = {
+        "eval_id": scored.eval_id,
+        "faithfulness": scored.faithfulness,
+        "context_precision": scored.context_precision,
+        "ragas_score": scored.ragas_score,
+        "eval_time_ms": round(scored.latency_ms, 2),
+    }
+    return {
+        "query": rag_reply.query,
+        "answer": rag_reply.answer,
+        "sources": rag_reply.sources,
+        "chunks_used": rag_reply.chunks_used,
+        "response_time_ms": round(rag_reply.response_time_ms, 2),
+        "ragas": ragas_section,
+    }
+@router.get("/metrics")
+async def get_metrics():
+    """
+    Return averages over every evaluation stored on the evaluator.
+    With no stored results the payload only contains total_evaluations (0)
+    and a message; otherwise it holds the three averages rounded to 3 places.
+    Raises:
+        HTTPException: 503 when the evaluator has not been initialized.
+    """
+    if not ragas_evaluator:
+        raise HTTPException(status_code=503, detail="RAGAS not initialized")
+    recorded = ragas_evaluator.results
+    if not recorded:
+        return {"total_evaluations": 0, "message": "No evaluations yet"}
+    count = len(recorded)
+    def average(attribute):
+        # Arithmetic mean of one metric attribute across all stored results.
+        return sum(getattr(entry, attribute) for entry in recorded) / count
+    return {
+        "total_evaluations": count,
+        "avg_faithfulness": round(average("faithfulness"), 3),
+        "avg_context_precision": round(average("context_precision"), 3),
+        "avg_ragas_score": round(average("ragas_score"), 3),
+    }

src/evaluation/ragas_integration/ragas_evaluator.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""
+RAGAS Evaluator - Core evaluation logic using RAGAS framework
+"""
+import os
+import logging
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, field
+from datetime import datetime
+# RAGAS imports
+from ragas.metrics import (
+    Faithfulness,
+    ResponseRelevancy,
+    LLMContextPrecisionWithoutReference,
+)
+from ragas.llms import LangchainLLMWrapper
+from ragas.dataset_schema import SingleTurnSample
+# LangChain for LLM wrapper (RAGAS requirement)
+from langchain_groq import ChatGroq
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+@dataclass
+class RagasEvaluationResult:
+    """
+    Result from RAGAS evaluation.
+    ragas_score is always recomputed in __post_init__ as the mean of
+    faithfulness and context_precision; any value passed for it is overwritten.
+    """
+    eval_id: str
+    query: str
+    # RAGAS metrics (0-1 scale)
+    faithfulness: float
+    answer_relevancy: float
+    context_precision: float
+    # Composite score
+    ragas_score: float = 0.0
+    # Metadata
+    latency_ms: float = 0.0
+    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+    def __post_init__(self):
+        """
+        Calculate composite RAGAS score.
+        Only missing values (None or NaN) are left out of the mean. A score of
+        0.0 is a legitimate result and must pull the composite down rather
+        than be ignored. With no usable score the composite is 0.0.
+        """
+        import math
+        scores = (self.faithfulness, self.context_precision)
+        valid_scores = [s for s in scores if s is not None and not math.isnan(s)]
+        self.ragas_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+class RagasEvaluator:
+    """
+    Scores RAG responses with RAGAS metrics, using a Groq-hosted chat model as judge.
+    Active metrics:
+    - Faithfulness: Is the answer grounded in the context?
+    - Context Precision: Are the retrieved chunks useful?
+    Answer Relevancy is not computed at the moment; results report 0.0 for it.
+    Every result is also appended to self.results.
+    """
+    def __init__(self, groq_api_key: Optional[str] = None):
+        """
+        Initialize RAGAS evaluator.
+        Args:
+            groq_api_key: Your Groq API key (or uses GROQ_API_KEY env var)
+        Raises:
+            ValueError: If no key is passed and GROQ_API_KEY is not set.
+        """
+        resolved_key = groq_api_key or os.getenv("GROQ_API_KEY")
+        if not resolved_key:
+            raise ValueError("GROQ_API_KEY required")
+        judge_model = ChatGroq(
+            api_key=resolved_key,
+            model_name="llama-3.3-70b-versatile",
+            temperature=0,
+        )
+        self.evaluator_llm = LangchainLLMWrapper(judge_model)
+        self.faithfulness = Faithfulness(llm=self.evaluator_llm)
+        # NOTE: the ResponseRelevancy (answer relevancy) metric is intentionally not set up.
+        self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)
+        # Storage for results
+        self.results: List[RagasEvaluationResult] = []
+        logger.info("✓ RAGAS Evaluator initialized (Faithfulness + Context Precision)")
+    async def evaluate_single(
+        self,
+        query: str,
+        answer: str,
+        contexts: List[str],
+        ground_truth: Optional[str] = None
+    ) -> RagasEvaluationResult:
+        """
+        Evaluate a single RAG response.
+        Args:
+            query: The user question.
+            answer: The generated answer to score.
+            contexts: Texts of the retrieved chunks.
+            ground_truth: Optional reference answer; an empty string is used
+                when it is not given.
+        Returns:
+            The RagasEvaluationResult, which is also appended to self.results.
+        """
+        import time
+        import hashlib
+        started = time.time()
+        sample = SingleTurnSample(
+            user_input=query,
+            response=answer,
+            retrieved_contexts=contexts,
+            reference=ground_truth or "",
+        )
+        # The two metrics are awaited one after the other.
+        faithfulness_value = await self.faithfulness.single_turn_ascore(sample)
+        precision_value = await self.context_precision.single_turn_ascore(sample)
+        elapsed_ms = (time.time() - started) * 1000
+        # Short id derived from the query text and the current timestamp.
+        id_seed = f"{query}{datetime.now().isoformat()}"
+        eval_id = hashlib.md5(id_seed.encode()).hexdigest()[:8]
+        result = RagasEvaluationResult(
+            eval_id=eval_id,
+            query=query,
+            faithfulness=float(faithfulness_value),
+            answer_relevancy=0.0,  # Placeholder: this metric is not computed
+            context_precision=float(precision_value),
+            latency_ms=elapsed_ms,
+        )
+        self.results.append(result)
+        logger.info(f"Evaluation complete: RAGAS score = {result.ragas_score:.3f}")
+        return result

src/evaluation/ragas_integration/test_ragas.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Quick test for RAGAS integration
+Run: python -m src.evaluation.ragas_integration.test_ragas
+"""
+import asyncio
+from src.rag import RAGPipeline, RAGConfig
+from src.evaluation.ragas_integration import RagasReadyPipeline, RagasEvaluator
+async def test_ragas():
+    """
+    End-to-end smoke check of the RAGAS integration.
+    Builds a pipeline and evaluator, ingests one short document, queries it
+    with full context capture, scores the response and prints each step.
+    Returns the RagasEvaluationResult of the single evaluation.
+    """
+    print("=" * 50)
+    print("Testing RAGAS Integration")
+    print("=" * 50)
+    # Step 1: pipeline plus the adapter that captures full contexts
+    print("\n1. Initializing pipeline...")
+    base_pipeline = RAGPipeline(RAGConfig())
+    adapter = RagasReadyPipeline(base_pipeline)
+    print("   ✓ Pipeline ready")
+    # Step 2: evaluator (reads GROQ_API_KEY from the environment)
+    print("\n2. Initializing RAGAS evaluator...")
+    judge = RagasEvaluator()
+    print("   ✓ Evaluator ready")
+    # Step 3: ingest a small sample document
+    print("\n3. Ingesting test document...")
+    sample_text = """
+    Machine learning is a subset of artificial intelligence that enables
+    systems to learn from data. Deep learning is a type of machine learning
+    that uses neural networks with multiple layers. Natural language processing
+    (NLP) is used to understand human language.
+    """
+    base_pipeline.ingest("test_doc", sample_text)
+    print(f"   ✓ Ingested {base_pipeline.vector_store.size()} chunks")
+    # Step 4: query through the adapter so the chunk texts are kept
+    print("\n4. Querying pipeline...")
+    reply = adapter.query_for_evaluation("What is machine learning?")
+    print(f"   Query: {reply.query}")
+    print(f"   Answer: {reply.answer[:100]}...")
+    print(f"   Contexts captured: {len(reply.contexts)}")
+    print(f"   Status: {reply.status}")
+    # Step 5: score the response
+    print("\n5. Running RAGAS evaluation...")
+    scores = await judge.evaluate_single(
+        query=reply.query,
+        answer=reply.answer,
+        contexts=reply.contexts,
+    )
+    print(f"\n{'=' * 50}")
+    print("RAGAS RESULTS")
+    print(f"{'=' * 50}")
+    print(f"   Faithfulness:      {scores.faithfulness:.3f}")
+    print(f"   Answer Relevancy:  {scores.answer_relevancy:.3f}")
+    print(f"   Context Precision: {scores.context_precision:.3f}")
+    print(f"   ─────────────────────────────")
+    print(f"   RAGAS Score:       {scores.ragas_score:.3f}")
+    print(f"   Eval Time:         {scores.latency_ms:.0f}ms")
+    print(f"{'=' * 50}")
+    return scores
+if __name__ == "__main__":
+    asyncio.run(test_ragas())

src/main.py CHANGED Viewed

@@ -10,6 +10,12 @@ from datetime import datetime
 import tempfile
 from pathlib import Path
 from src.rag import RAGPipeline, RAGConfig
 from src.evaluation import RAGEvaluator, EvaluationResult
 import io
@@ -49,7 +55,8 @@ if os.path.exists("frontend"):
 # Global pipeline instance
 pipeline: Optional[RAGPipeline] = None
 # ==================== Pydantic Models ====================
@@ -108,7 +115,7 @@ class StatsResponse(BaseModel):
 @app.on_event("startup")
 async def startup_event():
     """Initialize pipeline on startup."""
-    global pipeline
     logger.info("=" * 60)
     logger.info("Starting Document Intelligence RAG API")
@@ -124,8 +131,18 @@ async def startup_event():
         # Initialize pipeline (automatically uses get_embeddings_client())
         pipeline = RAGPipeline(config=config)
         logger.info("✓ Pipeline initialized successfully")
         logger.info(f"✓ Embedding backend: {config.embedding_backend}")
         logger.info(f"✓ API ready at http://localhost:8000")
         logger.info(f"✓ Interactive docs at http://localhost:8000/docs")
@@ -429,7 +446,7 @@ async def reset_system():
     Returns:
         Reset confirmation
     """
-    global pipeline
     if not pipeline:
         raise HTTPException(status_code=503, detail="Pipeline not initialized")
@@ -439,12 +456,15 @@ async def reset_system():
         # Clear vector store
         pipeline.vector_store.clear()
         logger.info("✓ System reset complete")
         return {
             "status": "success",
-            "message": "All documents and embeddings cleared",
             "chunks_remaining": 0,
             "timestamp": datetime.now().isoformat()
         }
@@ -659,6 +679,16 @@ async def query_with_evaluation(request: dict):
         return {"error": str(e)}, 500
 # ==================== Root Endpoint ====================
 @app.get("/", response_class=FileResponse)

 import tempfile
 from pathlib import Path
+from src.evaluation.ragas_integration import (
+    RagasReadyPipeline,
+    RagasEvaluator,
+    init_ragas_router,
+)
 from src.rag import RAGPipeline, RAGConfig
 from src.evaluation import RAGEvaluator, EvaluationResult
 import io
 # Global pipeline instance
 pipeline: Optional[RAGPipeline] = None
+ragas_pipeline = None
+ragas_evaluator = None
 # ==================== Pydantic Models ====================
 @app.on_event("startup")
 async def startup_event():
     """Initialize pipeline on startup."""
+    global pipeline, ragas_pipeline, ragas_evaluator
     logger.info("=" * 60)
     logger.info("Starting Document Intelligence RAG API")
         # Initialize pipeline (automatically uses get_embeddings_client())
         pipeline = RAGPipeline(config=config)
         logger.info("✓ Pipeline initialized successfully")
+        # RAGAS integration
+        ragas_pipeline = RagasReadyPipeline(pipeline)
+        logger.info("✓ Ragas pipeline initialized successfully")
+        ragas_evaluator = RagasEvaluator()
+        logger.info("✓ Ragas evaluator initialized successfully")
+        ragas_router = init_ragas_router(ragas_pipeline, ragas_evaluator)
+        app.include_router(ragas_router, prefix="/ragas", tags=["RAGAS Evaluation"])
+        logger.info("✓ Ragas evaluator initialized successfully")
         logger.info(f"✓ Embedding backend: {config.embedding_backend}")
         logger.info(f"✓ API ready at http://localhost:8000")
         logger.info(f"✓ Interactive docs at http://localhost:8000/docs")
     Returns:
         Reset confirmation
     """
+    global pipeline, ragas_evaluator
     if not pipeline:
         raise HTTPException(status_code=503, detail="Pipeline not initialized")
         # Clear vector store
         pipeline.vector_store.clear()
+        if ragas_evaluator:
+            ragas_evaluator.results = []
+            logger.info("✓ RAGAS evaluations cleared")
         logger.info("✓ System reset complete")
         return {
             "status": "success",
+            "message": "All documents, embeddings, and RAGAS evaluations cleared",
             "chunks_remaining": 0,
             "timestamp": datetime.now().isoformat()
         }
         return {"error": str(e)}, 500
+# ===================== RAGAS Endpoints ====================
+@app.get("/ragas-demo")
+async def ragas_demo_page():
+    """Serve the RAGAS evaluation demo page.
+
+    Returns:
+        A ``FileResponse`` for ``frontend/ragas.html`` when that file exists.
+
+    Raises:
+        HTTPException: 404 when the demo page is missing, so clients receive
+            a real error status instead of a 200 response carrying an error
+            payload.
+    """
+    frontend_path = "frontend/ragas.html"
+    # Guard clause: a plain dict return would be sent as HTTP 200, which hides
+    # the missing file from browsers, monitors, and API clients.
+    if not os.path.exists(frontend_path):
+        raise HTTPException(status_code=404, detail="RAGAS demo page not found")
+    return FileResponse(frontend_path)
 # ==================== Root Endpoint ====================
 @app.get("/", response_class=FileResponse)

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff