aankitdas commited on
Commit
af2f8e1
Β·
1 Parent(s): 714bdb1

feat: add RAGAS eval

Browse files
evaluation_results/_results.jsonl ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8615102352119911, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.6217199504672873, "bert_score": 0.9101784656133992, "answer_relevance": 0.8611807441816679, "faithfulness": 0.9889532712914122, "hallucination_detected": 0, "source_attribution_score": 0.9197433053801606, "latency_ms": 193.9050181207473, "tokens_used": 180, "cost_cents": 0.1947866279399885, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
2
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6691151867351297, "retrieval_recall": 0.823127264267807, "rank_position": 1, "rouge_l": 0.714583633420124, "bert_score": 0.7968070501948343, "answer_relevance": 0.8386952468169229, "faithfulness": 0.8427198816502497, "hallucination_detected": 0, "source_attribution_score": 0.834049480985246, "latency_ms": 309.52617615332184, "tokens_used": 130, "cost_cents": 0.5222450372503339, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
3
+ {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 2, "retrieval_precision": 0.7820863657323606, "retrieval_recall": 0.7278826391993161, "rank_position": 4, "rouge_l": 0.7288516571075816, "bert_score": 0.800838399605806, "answer_relevance": 0.7623839343155656, "faithfulness": 0.760938424869514, "hallucination_detected": 0, "source_attribution_score": 0.7367638541396095, "latency_ms": 127.88553000716428, "tokens_used": 86, "cost_cents": 0.6028654205830427, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
4
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8032957101002208, "retrieval_recall": 0.9059703284838815, "rank_position": 1, "rouge_l": 0.8300273338544246, "bert_score": 0.9454453940286349, "answer_relevance": 0.9520338304764728, "faithfulness": 0.897131533318752, "hallucination_detected": 0, "source_attribution_score": 0.9492810947177941, "latency_ms": 465.42015740446305, "tokens_used": 223, "cost_cents": 0.20152073810222879, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
5
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8530337397480929, "retrieval_recall": 0.7059261296867919, "rank_position": 2, "rouge_l": 0.7193205808960748, "bert_score": 0.9022171118953591, "answer_relevance": 0.8531732924021801, "faithfulness": 0.8121930123501006, "hallucination_detected": 0, "source_attribution_score": 0.8249719199625603, "latency_ms": 118.93191807619638, "tokens_used": 156, "cost_cents": 0.6705483559336415, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
6
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.871851781632808, "retrieval_recall": 1.0, "rank_position": 2, "rouge_l": 0.8763052973676115, "bert_score": 0.9155189067363468, "answer_relevance": 0.7819811920531572, "faithfulness": 0.9020511875557776, "hallucination_detected": 0, "source_attribution_score": 0.8835911263653357, "latency_ms": 215.06062627830062, "tokens_used": 158, "cost_cents": 0.28454809454724767, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
7
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8374406518052472, "retrieval_recall": 0.8995269271491464, "rank_position": 1, "rouge_l": 0.6625752862799461, "bert_score": 0.8433178449037969, "answer_relevance": 0.8430013035861083, "faithfulness": 0.893951241843859, "hallucination_detected": 0, "source_attribution_score": 0.7615935243739598, "latency_ms": 419.38297913278507, "tokens_used": 219, "cost_cents": 0.685936998794628, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
8
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7801096274968522, "retrieval_recall": 0.6863130363664056, "rank_position": 2, "rouge_l": 0.7314517596590595, "bert_score": 0.8276297752821052, "answer_relevance": 0.7915041989155733, "faithfulness": 0.814200114298667, "hallucination_detected": 0, "source_attribution_score": 0.7910397701255416, "latency_ms": 192.75282528673864, "tokens_used": 140, "cost_cents": 0.5706402044081957, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
9
+ {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9319798193959905, "retrieval_recall": 0.7301414759104026, "rank_position": 3, "rouge_l": 0.9195189478153559, "bert_score": 0.9506571721308754, "answer_relevance": 0.9012898093375585, "faithfulness": 0.9159276711160365, "hallucination_detected": 0, "source_attribution_score": 0.8105097496319957, "latency_ms": 310.01153330005803, "tokens_used": 134, "cost_cents": 0.36313962364633723, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
10
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.987131336980288, "retrieval_recall": 0.8161833189974133, "rank_position": 2, "rouge_l": 0.754633239450571, "bert_score": 0.8525460742457374, "answer_relevance": 0.8388153285264023, "faithfulness": 0.8947958687708046, "hallucination_detected": 0, "source_attribution_score": 0.8670965141635586, "latency_ms": 367.03119966417205, "tokens_used": 105, "cost_cents": 0.6425165690009661, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
11
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8161943972603446, "retrieval_recall": 0.8191451209916161, "rank_position": 1, "rouge_l": 0.8566238483374247, "bert_score": 0.8407886193759627, "answer_relevance": 0.788647130938179, "faithfulness": 0.9458751488959517, "hallucination_detected": 0, "source_attribution_score": 0.8442883639082127, "latency_ms": 394.39735015927437, "tokens_used": 126, "cost_cents": 0.6182353694114775, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "730981e3"}
12
+ {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9268720168249583, "retrieval_recall": 0.7744657390458949, "rank_position": 1, "rouge_l": 0.726991263638828, "bert_score": 0.9072089249292097, "answer_relevance": 0.7368736773342853, "faithfulness": 0.9109545928726132, "hallucination_detected": 0, "source_attribution_score": 0.8389074559482628, "latency_ms": 363.97033617468753, "tokens_used": 89, "cost_cents": 0.1520075706493582, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
13
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 1.0, "retrieval_recall": 0.7883493024047399, "rank_position": 2, "rouge_l": 0.8794507996771228, "bert_score": 0.9890372805052198, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9202874935555082, "latency_ms": 180.5318450150473, "tokens_used": 164, "cost_cents": 0.3633483811341406, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
14
+ {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9574712547642229, "retrieval_recall": 0.898715076798533, "rank_position": 3, "rouge_l": 0.8192000079755279, "bert_score": 0.8864239733582311, "answer_relevance": 0.7428977779588922, "faithfulness": 0.9030187960492433, "hallucination_detected": 0, "source_attribution_score": 0.7624554954695243, "latency_ms": 373.14060250844705, "tokens_used": 84, "cost_cents": 0.7117749597236492, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
15
+ {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9405730690612055, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.8808942707215686, "bert_score": 0.9132934029079159, "answer_relevance": 0.905135078735406, "faithfulness": 0.8931492108116512, "hallucination_detected": 0, "source_attribution_score": 0.8268512614166635, "latency_ms": 339.0772795799579, "tokens_used": 214, "cost_cents": 0.7190302687955942, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
16
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9730519547186312, "retrieval_recall": 0.7000247515495168, "rank_position": 1, "rouge_l": 0.8481502060571952, "bert_score": 0.8512169150469242, "answer_relevance": 0.6066385743234217, "faithfulness": 0.90739914345254, "hallucination_detected": 0, "source_attribution_score": 0.7272214400773345, "latency_ms": 180.27676298939465, "tokens_used": 213, "cost_cents": 0.32992956367012927, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
17
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.882795855159822, "retrieval_recall": 0.9536984414043154, "rank_position": 2, "rouge_l": 0.8652639536487609, "bert_score": 0.9510206928805952, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9041183043586343, "latency_ms": 304.8777755850387, "tokens_used": 202, "cost_cents": 0.4573272605920282, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ea8a82db"}
18
+ {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9074913930124041, "retrieval_recall": 0.8337368592917234, "rank_position": 2, "rouge_l": 0.7866443218610295, "bert_score": 0.8890794098843255, "answer_relevance": 0.8644104258787939, "faithfulness": 0.9162807414858771, "hallucination_detected": 0, "source_attribution_score": 0.8535049903935279, "latency_ms": 153.72881316060213, "tokens_used": 192, "cost_cents": 0.42866080997615663, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "f93850ae"}
19
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.736362372230679, "retrieval_recall": 0.6501144391355456, "rank_position": 1, "rouge_l": 0.550190206571193, "bert_score": 0.7928686128619693, "answer_relevance": 0.7479453349256642, "faithfulness": 0.724801089955014, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 318.4624295326313, "tokens_used": 190, "cost_cents": 0.2437715682365154, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
20
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6735724740087383, "retrieval_recall": 0.8542061489968834, "rank_position": 3, "rouge_l": 0.753272128099022, "bert_score": 0.7951286425632846, "answer_relevance": 0.7111372864814403, "faithfulness": 0.7658078142862852, "hallucination_detected": 0, "source_attribution_score": 0.7921982358187583, "latency_ms": 74.69663423689695, "tokens_used": 81, "cost_cents": 0.27307125512490826, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "d72f240c"}
21
+ {"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
22
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
23
+ {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
24
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
25
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
26
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
27
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
28
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
29
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
30
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
31
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
32
+ {"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
33
+ {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
34
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
35
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
36
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
37
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
38
+ {"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
39
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
40
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
41
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
42
+ {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
43
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
44
+ {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
45
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
46
+ {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
47
+ {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
48
+ {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
49
+ {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
50
+ {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}
evaluation_results/results.jsonl CHANGED
@@ -1,50 +1,20 @@
1
- {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8615102352119911, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.6217199504672873, "bert_score": 0.9101784656133992, "answer_relevance": 0.8611807441816679, "faithfulness": 0.9889532712914122, "hallucination_detected": 0, "source_attribution_score": 0.9197433053801606, "latency_ms": 193.9050181207473, "tokens_used": 180, "cost_cents": 0.1947866279399885, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
2
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6691151867351297, "retrieval_recall": 0.823127264267807, "rank_position": 1, "rouge_l": 0.714583633420124, "bert_score": 0.7968070501948343, "answer_relevance": 0.8386952468169229, "faithfulness": 0.8427198816502497, "hallucination_detected": 0, "source_attribution_score": 0.834049480985246, "latency_ms": 309.52617615332184, "tokens_used": 130, "cost_cents": 0.5222450372503339, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
3
- {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 2, "retrieval_precision": 0.7820863657323606, "retrieval_recall": 0.7278826391993161, "rank_position": 4, "rouge_l": 0.7288516571075816, "bert_score": 0.800838399605806, "answer_relevance": 0.7623839343155656, "faithfulness": 0.760938424869514, "hallucination_detected": 0, "source_attribution_score": 0.7367638541396095, "latency_ms": 127.88553000716428, "tokens_used": 86, "cost_cents": 0.6028654205830427, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
4
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8032957101002208, "retrieval_recall": 0.9059703284838815, "rank_position": 1, "rouge_l": 0.8300273338544246, "bert_score": 0.9454453940286349, "answer_relevance": 0.9520338304764728, "faithfulness": 0.897131533318752, "hallucination_detected": 0, "source_attribution_score": 0.9492810947177941, "latency_ms": 465.42015740446305, "tokens_used": 223, "cost_cents": 0.20152073810222879, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
5
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8530337397480929, "retrieval_recall": 0.7059261296867919, "rank_position": 2, "rouge_l": 0.7193205808960748, "bert_score": 0.9022171118953591, "answer_relevance": 0.8531732924021801, "faithfulness": 0.8121930123501006, "hallucination_detected": 0, "source_attribution_score": 0.8249719199625603, "latency_ms": 118.93191807619638, "tokens_used": 156, "cost_cents": 0.6705483559336415, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
6
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.871851781632808, "retrieval_recall": 1.0, "rank_position": 2, "rouge_l": 0.8763052973676115, "bert_score": 0.9155189067363468, "answer_relevance": 0.7819811920531572, "faithfulness": 0.9020511875557776, "hallucination_detected": 0, "source_attribution_score": 0.8835911263653357, "latency_ms": 215.06062627830062, "tokens_used": 158, "cost_cents": 0.28454809454724767, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
7
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8374406518052472, "retrieval_recall": 0.8995269271491464, "rank_position": 1, "rouge_l": 0.6625752862799461, "bert_score": 0.8433178449037969, "answer_relevance": 0.8430013035861083, "faithfulness": 0.893951241843859, "hallucination_detected": 0, "source_attribution_score": 0.7615935243739598, "latency_ms": 419.38297913278507, "tokens_used": 219, "cost_cents": 0.685936998794628, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
8
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7801096274968522, "retrieval_recall": 0.6863130363664056, "rank_position": 2, "rouge_l": 0.7314517596590595, "bert_score": 0.8276297752821052, "answer_relevance": 0.7915041989155733, "faithfulness": 0.814200114298667, "hallucination_detected": 0, "source_attribution_score": 0.7910397701255416, "latency_ms": 192.75282528673864, "tokens_used": 140, "cost_cents": 0.5706402044081957, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
9
- {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9319798193959905, "retrieval_recall": 0.7301414759104026, "rank_position": 3, "rouge_l": 0.9195189478153559, "bert_score": 0.9506571721308754, "answer_relevance": 0.9012898093375585, "faithfulness": 0.9159276711160365, "hallucination_detected": 0, "source_attribution_score": 0.8105097496319957, "latency_ms": 310.01153330005803, "tokens_used": 134, "cost_cents": 0.36313962364633723, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
10
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.987131336980288, "retrieval_recall": 0.8161833189974133, "rank_position": 2, "rouge_l": 0.754633239450571, "bert_score": 0.8525460742457374, "answer_relevance": 0.8388153285264023, "faithfulness": 0.8947958687708046, "hallucination_detected": 0, "source_attribution_score": 0.8670965141635586, "latency_ms": 367.03119966417205, "tokens_used": 105, "cost_cents": 0.6425165690009661, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
11
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8161943972603446, "retrieval_recall": 0.8191451209916161, "rank_position": 1, "rouge_l": 0.8566238483374247, "bert_score": 0.8407886193759627, "answer_relevance": 0.788647130938179, "faithfulness": 0.9458751488959517, "hallucination_detected": 0, "source_attribution_score": 0.8442883639082127, "latency_ms": 394.39735015927437, "tokens_used": 126, "cost_cents": 0.6182353694114775, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "730981e3"}
12
- {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9268720168249583, "retrieval_recall": 0.7744657390458949, "rank_position": 1, "rouge_l": 0.726991263638828, "bert_score": 0.9072089249292097, "answer_relevance": 0.7368736773342853, "faithfulness": 0.9109545928726132, "hallucination_detected": 0, "source_attribution_score": 0.8389074559482628, "latency_ms": 363.97033617468753, "tokens_used": 89, "cost_cents": 0.1520075706493582, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
13
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 1.0, "retrieval_recall": 0.7883493024047399, "rank_position": 2, "rouge_l": 0.8794507996771228, "bert_score": 0.9890372805052198, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9202874935555082, "latency_ms": 180.5318450150473, "tokens_used": 164, "cost_cents": 0.3633483811341406, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
14
- {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9574712547642229, "retrieval_recall": 0.898715076798533, "rank_position": 3, "rouge_l": 0.8192000079755279, "bert_score": 0.8864239733582311, "answer_relevance": 0.7428977779588922, "faithfulness": 0.9030187960492433, "hallucination_detected": 0, "source_attribution_score": 0.7624554954695243, "latency_ms": 373.14060250844705, "tokens_used": 84, "cost_cents": 0.7117749597236492, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
15
- {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9405730690612055, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.8808942707215686, "bert_score": 0.9132934029079159, "answer_relevance": 0.905135078735406, "faithfulness": 0.8931492108116512, "hallucination_detected": 0, "source_attribution_score": 0.8268512614166635, "latency_ms": 339.0772795799579, "tokens_used": 214, "cost_cents": 0.7190302687955942, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
16
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9730519547186312, "retrieval_recall": 0.7000247515495168, "rank_position": 1, "rouge_l": 0.8481502060571952, "bert_score": 0.8512169150469242, "answer_relevance": 0.6066385743234217, "faithfulness": 0.90739914345254, "hallucination_detected": 0, "source_attribution_score": 0.7272214400773345, "latency_ms": 180.27676298939465, "tokens_used": 213, "cost_cents": 0.32992956367012927, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
17
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.882795855159822, "retrieval_recall": 0.9536984414043154, "rank_position": 2, "rouge_l": 0.8652639536487609, "bert_score": 0.9510206928805952, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9041183043586343, "latency_ms": 304.8777755850387, "tokens_used": 202, "cost_cents": 0.4573272605920282, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ea8a82db"}
18
- {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9074913930124041, "retrieval_recall": 0.8337368592917234, "rank_position": 2, "rouge_l": 0.7866443218610295, "bert_score": 0.8890794098843255, "answer_relevance": 0.8644104258787939, "faithfulness": 0.9162807414858771, "hallucination_detected": 0, "source_attribution_score": 0.8535049903935279, "latency_ms": 153.72881316060213, "tokens_used": 192, "cost_cents": 0.42866080997615663, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "f93850ae"}
19
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.736362372230679, "retrieval_recall": 0.6501144391355456, "rank_position": 1, "rouge_l": 0.550190206571193, "bert_score": 0.7928686128619693, "answer_relevance": 0.7479453349256642, "faithfulness": 0.724801089955014, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 318.4624295326313, "tokens_used": 190, "cost_cents": 0.2437715682365154, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
20
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6735724740087383, "retrieval_recall": 0.8542061489968834, "rank_position": 3, "rouge_l": 0.753272128099022, "bert_score": 0.7951286425632846, "answer_relevance": 0.7111372864814403, "faithfulness": 0.7658078142862852, "hallucination_detected": 0, "source_attribution_score": 0.7921982358187583, "latency_ms": 74.69663423689695, "tokens_used": 81, "cost_cents": 0.27307125512490826, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "d72f240c"}
21
- {"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
22
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
23
- {"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
24
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
25
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
26
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
27
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
28
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
29
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
30
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
31
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
32
- {"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
33
- {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
34
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
35
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
36
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
37
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
38
- {"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
39
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
40
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
41
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
42
- {"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
43
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
44
- {"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
45
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
46
- {"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
47
- {"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
48
- {"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
49
- {"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
50
- {"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}
 
1
+ {"query": "What is the response rate of Drug X?", "answer": "According to the provided context (Chunk 1), Drug X demonstrated a 65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 324.74422454833984, "tokens_used": 21, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:34.998293", "eval_id": "4e10192a"}
2
+ {"query": "What are the side effects of Drug X?", "answer": "The side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.8542079925537, "tokens_used": 42, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.198696", "eval_id": "05359951"}
3
+ {"query": "How does Drug X interact with warfarin?", "answer": "There is information about the interaction between Drug X and warfarin in the context provided. According to the information in Chunk 1, the interaction between Drug X and warfarin is:\n\n- Potential increased bleeding risk\n- Monitor INR closely\n- Baseline INR and weekly monitoring recommended", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 274.7206687927246, "tokens_used": 46, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.474923", "eval_id": "09eb8892"}
4
+ {"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 233.20865631103516, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.708131", "eval_id": "297130d7"}
5
+ {"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 150.78186988830566, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.860418", "eval_id": "f55b494b"}
6
+ {"query": "What is the response rate of Drug X?", "answer": "65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 435.7447624206543, "tokens_used": 10, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.660827", "eval_id": "92c25556"}
7
+ {"query": "What are the side effects of Drug X?", "answer": "The most common adverse events reported in the clinical trial for Drug X were:\n\n1. Headache (12% of patients)\n2. Mild gastrointestinal upset (8% of patients)\n3. Dizziness (5% of patients)\n4. Fatigue (4% of patients)\n\nAdditionally, in the case study, the patient experienced:\n\n1. Mild headache (resolved with acetaminophen)\n2. Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 236.0692024230957, "tokens_used": 57, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.898918", "eval_id": "1fab9b0a"}
8
+ {"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context (Chunk 1 - Relevance: 66.9%), Drug X has a potential interaction with warfarin: \"2. Warfarin: - Potential increased bleeding risk - Monitor INR closely - Baseline INR and weekly monitoring recommended\"", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.96793365478516, "tokens_used": 36, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.098393", "eval_id": "83a684fc"}
9
+ {"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 149.9345302581787, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.248327", "eval_id": "a0771f33"}
10
+ {"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 179.38470840454102, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.440212", "eval_id": "35939350"}
11
+ {"query": "What is the response rate of Drug X?", "answer": "The response rate of Drug X is 65%, compared to 35% in the placebo group (p<0.001).", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5333333333333333, "hallucination_detected": true, "source_attribution_score": 0.3125, "latency_ms": 336.91954612731934, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.623166", "eval_id": "3688751b"}
12
+ {"query": "What are the side effects of Drug X?", "answer": "The most common side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.18181818181818182, "hallucination_detected": true, "source_attribution_score": 0.06818181818181818, "latency_ms": 331.4476013183594, "tokens_used": 44, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.955619", "eval_id": "586b096d"}
13
+ {"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context, Drug X interacts with warfarin by potentially increasing the risk of bleeding. Monitoring of INR is recommended, including baseline INR and weekly monitoring is recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.48, "hallucination_detected": true, "source_attribution_score": 0.23333333333333334, "latency_ms": 283.388614654541, "tokens_used": 30, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.240178", "eval_id": "a37df1d9"}
14
+ {"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals. Dose adjustments may be necessary for patients with renal impairment, in which case a dose reduction to 250mg twice daily is recommended.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.23529411764705882, "latency_ms": 272.6554870605469, "tokens_used": 34, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.512833", "eval_id": "66ecfa00"}
15
+ {"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.22580645161290322, "latency_ms": 165.5900478363037, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.679422", "eval_id": "4b7e99dd"}
16
+ {"query": "What is the response rate of Drug X?", "answer": "According to the provided context in Chunk 1, the response rate of Drug X was 65%.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.375, "latency_ms": 594.7244167327881, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.135089", "eval_id": "f5583f32"}
17
+ {"query": "What are the side effects of Drug X?", "answer": "According to the provided context, the side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, from the case study, the patient experienced:\n\n- Mild headache (resolved with acetaminophen)\n- Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.3888888888888889, "hallucination_detected": true, "source_attribution_score": 0.2, "latency_ms": 265.3524875640869, "tokens_used": 55, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.403565", "eval_id": "07148b4e"}
18
+ {"query": "How does Drug X interact with warfarin?", "answer": "According to the context, Drug X interacts with warfarin as follows: \n\n- There is a potential increased bleeding risk.\n- It is recommended to monitor INR closely.\n- Baseline INR and weekly monitoring are recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.4666666666666667, "hallucination_detected": true, "source_attribution_score": 0.22857142857142856, "latency_ms": 339.0464782714844, "tokens_used": 35, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.742611", "eval_id": "b7c99790"}
19
+ {"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dosage of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.3333333333333333, "latency_ms": 176.81241035461426, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.920425", "eval_id": "1993b43f"}
20
+ {"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.46153846153846156, "hallucination_detected": true, "source_attribution_score": 0.2222222222222222, "latency_ms": 389.65582847595215, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:11.310612", "eval_id": "e1fdaf31"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/evaluation.html CHANGED
@@ -7,154 +7,280 @@
7
  <title>RAG Evaluation Dashboard</title>
8
  <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
9
  <style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  * {
11
  margin: 0;
12
  padding: 0;
13
  box-sizing: border-box;
14
  }
15
 
 
 
 
16
  body {
17
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
18
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
19
  min-height: 100vh;
20
- padding: 20px;
 
21
  }
22
 
 
 
 
23
  .container {
24
- max-width: 1400px;
25
  margin: 0 auto;
26
- background: white;
27
- border-radius: 15px;
28
- box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
29
- overflow: hidden;
30
  }
31
 
32
- .header {
33
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
34
- color: white;
35
- padding: 40px 30px;
36
  text-align: center;
 
37
  }
38
 
39
- .header h1 {
40
- font-size: 2.5em;
41
- margin-bottom: 10px;
 
 
42
  }
43
 
44
- .header p {
45
- font-size: 1.1em;
46
- opacity: 0.9;
 
47
  }
48
 
49
  .nav-buttons {
50
  display: flex;
51
- gap: 10px;
52
  justify-content: center;
53
- margin-top: 20px;
54
  flex-wrap: wrap;
 
55
  }
56
 
57
  .nav-btn {
58
- padding: 10px 20px;
59
- background: rgba(255, 255, 255, 0.2);
60
- border: 2px solid white;
 
 
61
  color: white;
62
- border-radius: 5px;
 
 
 
 
 
63
  cursor: pointer;
64
- font-size: 1em;
65
- transition: all 0.3s;
66
  }
67
 
68
- .nav-btn:hover,
69
- .nav-btn.active {
70
- background: white;
71
- color: #667eea;
72
  }
73
 
74
- .content {
75
- padding: 30px;
 
 
76
  }
77
 
78
- .section {
79
- display: none;
80
  }
81
 
82
- .section.active {
83
- display: block;
84
- animation: fadeIn 0.3s;
85
  }
86
 
87
- @keyframes fadeIn {
88
- from {
89
- opacity: 0;
90
- }
91
 
92
- to {
93
- opacity: 1;
94
- }
 
 
 
 
 
 
 
 
 
 
95
  }
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  .metrics-grid {
98
  display: grid;
99
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
100
- gap: 20px;
101
- margin-bottom: 30px;
102
  }
103
 
104
  .metric-card {
105
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
106
  color: white;
107
- padding: 25px;
108
- border-radius: 10px;
109
  text-align: center;
110
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
111
- transition: transform 0.3s;
112
  }
113
 
114
  .metric-card:hover {
115
- transform: translateY(-5px);
 
 
 
 
 
 
 
 
 
 
 
 
116
  }
117
 
118
  .metric-label {
119
- font-size: 0.9em;
120
  opacity: 0.9;
121
- margin-bottom: 10px;
122
  text-transform: uppercase;
123
- letter-spacing: 1px;
124
  }
125
 
126
  .metric-value {
127
- font-size: 2em;
128
- font-weight: bold;
129
  }
130
 
131
  .metric-unit {
132
- font-size: 0.7em;
133
  opacity: 0.8;
134
- margin-top: 5px;
135
  }
136
 
 
 
 
137
  .chart-container {
138
- background: white;
139
- border: 1px solid #e0e0e0;
140
- border-radius: 10px;
141
- padding: 20px;
142
- margin-bottom: 30px;
143
- min-height: 400px;
144
  }
145
 
146
  .chart-title {
147
- font-size: 1.3em;
148
  font-weight: 600;
149
- margin-bottom: 15px;
150
- color: #333;
151
  }
152
 
153
  .two-column {
154
  display: grid;
155
  grid-template-columns: 1fr 1fr;
156
  gap: 20px;
157
- margin-bottom: 20px;
158
  }
159
 
160
  @media (max-width: 900px) {
@@ -163,31 +289,49 @@
163
  }
164
  }
165
 
166
- .status-good {
167
- color: #2ecc71;
 
 
 
168
  }
169
 
170
- .status-warning {
171
- color: #f39c12;
 
172
  }
173
 
174
- .status-critical {
175
- color: #e74c3c;
 
 
 
 
 
 
 
 
176
  }
177
 
 
 
 
178
  .info-box {
179
- background: #f8f9fa;
180
- border-left: 4px solid #667eea;
181
- padding: 15px;
 
182
  margin-bottom: 20px;
183
- border-radius: 5px;
184
  }
185
 
186
  .info-box p {
187
- color: #555;
188
- margin: 5px 0;
189
  }
190
 
 
 
 
191
  .button-group {
192
  display: flex;
193
  gap: 10px;
@@ -196,43 +340,49 @@
196
  }
197
 
198
  .btn {
199
- padding: 10px 20px;
200
- background: #667eea;
201
  color: white;
202
  border: none;
203
- border-radius: 5px;
204
  cursor: pointer;
205
- font-size: 1em;
206
- transition: background 0.3s;
 
207
  }
208
 
209
  .btn:hover {
210
- background: #764ba2;
211
  }
212
 
213
  .btn-secondary {
214
- background: #95a5a6;
 
 
215
  }
216
 
217
  .btn-secondary:hover {
218
- background: #7f8c8d;
219
  }
220
 
 
 
 
221
  .loading {
222
  display: none;
223
  text-align: center;
224
- padding: 20px;
225
- color: #667eea;
226
  }
227
 
228
  .spinner {
229
- border: 4px solid #f3f3f3;
230
- border-top: 4px solid #667eea;
231
  border-radius: 50%;
232
  width: 40px;
233
  height: 40px;
234
  animation: spin 1s linear infinite;
235
- margin: 0 auto 10px;
236
  }
237
 
238
  @keyframes spin {
@@ -245,272 +395,287 @@
245
  }
246
  }
247
 
248
- .failure-list {
249
- max-height: 400px;
250
- overflow-y: auto;
251
- }
252
-
253
  .failure-item {
254
- background: #f8f9fa;
255
- padding: 10px;
256
- margin: 5px 0;
257
- border-radius: 5px;
258
- border-left: 4px solid #e74c3c;
259
  }
260
 
261
  .failure-item-query {
262
- font-weight: 600;
263
- color: #333;
 
264
  }
265
 
266
  .failure-item-score {
267
- font-size: 0.9em;
268
- color: #e74c3c;
269
- margin-top: 5px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  }
271
  </style>
272
  </head>
273
 
274
  <body>
 
 
275
  <div class="container">
276
- <div class="header">
277
- <h1>πŸ” RAG Evaluation Dashboard</h1>
278
- <p>Comprehensive evaluation metrics for your Retrieval-Augmented Generation system</p>
279
  <div class="nav-buttons">
280
- <button class="nav-btn active" onclick="showSection('overview')">Overview</button>
281
- <button class="nav-btn" onclick="showSection('retrieval')">Retrieval</button>
282
- <button class="nav-btn" onclick="showSection('generation')">Generation</button>
283
- <button class="nav-btn" onclick="showSection('faithfulness')">Faithfulness</button>
284
- <button class="nav-btn" onclick="showSection('performance')">Performance</button>
285
- <button class="nav-btn" onclick="showSection('failures')">Failures</button>
286
  </div>
 
 
 
 
 
 
 
 
 
 
287
  </div>
288
 
289
- <div class="content">
290
- <!-- Overview Section -->
291
- <div id="overview" class="section active">
292
- <div class="button-group">
293
- <button class="btn" onclick="loadMetrics()">πŸ”„ Refresh Metrics</button>
294
- <button class="btn btn-secondary" onclick="exportResults()">πŸ“₯ Export Results</button>
295
- <button class="btn btn-secondary" onclick="clearResults()">πŸ—‘οΈ Clear Results</button>
296
- </div>
297
 
298
- <div class="loading" id="loading">
299
- <div class="spinner"></div>
300
- Loading metrics...
 
 
 
301
  </div>
302
-
303
- <div class="metrics-grid" id="metricsGrid">
304
- <!-- Populated by JavaScript -->
305
  </div>
306
-
307
- <div class="info-box">
308
- <p><strong>πŸ“Š Total Evaluations:</strong> <span id="totalEvals">0</span></p>
309
- <p><strong>πŸ“… Last Updated:</strong> <span id="lastUpdated">--</span></p>
310
- <p><strong>βœ… System Status:</strong> <span id="systemStatus">Initializing...</span></p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  </div>
312
  </div>
313
 
314
- <!-- Retrieval Section -->
315
- <div id="retrieval" class="section">
316
- <h2 class="chart-title">πŸ“ˆ Retrieval Quality Analysis</h2>
317
-
318
- <div class="two-column">
319
- <div class="chart-container">
320
- <div class="chart-title">Precision & Recall Trend</div>
321
- <div id="retrievalChart"></div>
322
- </div>
323
- <div class="chart-container">
324
- <div class="chart-title">Key Metrics</div>
325
- <div style="padding: 20px;">
326
- <p><strong>Mean Reciprocal Rank (MRR):</strong> <span id="mrrValue">--</span></p>
327
- <p style="margin-top: 10px; font-size: 0.9em;">Measures ranking quality of retrieved
328
- documents. Higher is better (ideal: 1.0)</p>
329
- <hr style="margin: 15px 0;">
330
- <p><strong>Avg Precision:</strong> <span id="avgPrecision">--</span></p>
331
- <p style="margin-top: 10px;"><strong>Avg Recall:</strong> <span id="avgRecall">--</span></p>
332
- </div>
333
- </div>
334
- </div>
335
  </div>
336
 
337
- <!-- Generation Section -->
338
- <div id="generation" class="section">
339
- <h2 class="chart-title">🎯 Generation Quality Metrics</h2>
 
 
 
340
 
341
- <div class="two-column">
342
- <div class="chart-container">
343
- <div class="chart-title">Quality Score Trends</div>
344
- <div id="generationChart"></div>
345
- </div>
346
- <div class="chart-container">
347
- <div class="chart-title">Average Scores</div>
348
- <div id="generationBars"></div>
349
- </div>
 
350
  </div>
351
-
352
- <div class="info-box">
353
- <p><strong>ROUGE-L:</strong> Token-level overlap between generated and reference answers (0-1)</p>
354
- <p><strong>BERTScore:</strong> Semantic similarity using contextual embeddings (0-1)</p>
355
- <p><strong>Answer Relevance:</strong> How relevant is the answer to the query (0-1)</p>
 
 
356
  </div>
357
  </div>
 
358
 
359
- <!-- Faithfulness Section -->
360
- <div id="faithfulness" class="section">
361
- <h2 class="chart-title">βœ… Faithfulness & Source Attribution</h2>
362
-
363
- <div class="two-column">
364
- <div class="chart-container">
365
- <div class="chart-title">Hallucination Distribution</div>
366
- <div id="hallucinationChart"></div>
367
- </div>
368
- <div class="chart-container">
369
- <div class="chart-title">Faithfulness Trend</div>
370
- <div id="faithfulnessChart"></div>
371
- </div>
372
  </div>
373
  </div>
 
374
 
375
- <!-- Performance Section -->
376
- <div id="performance" class="section">
377
- <h2 class="chart-title">⚑ Performance & Cost Analysis</h2>
378
-
379
- <div class="two-column">
380
- <div class="chart-container">
381
- <div class="chart-title">Latency vs Cost</div>
382
- <div id="latencyChart"></div>
383
- </div>
384
- <div class="chart-container">
385
- <div class="chart-title">Latency Percentiles</div>
386
- <div id="percentileChart"></div>
387
- </div>
388
  </div>
389
-
390
- <div class="metrics-grid">
391
- <div class="metric-card">
392
- <div class="metric-label">P50 Latency</div>
393
- <div class="metric-value" id="p50Value">--</div>
394
- <div class="metric-unit">milliseconds</div>
395
- </div>
396
- <div class="metric-card">
397
- <div class="metric-label">P95 Latency</div>
398
- <div class="metric-value" id="p95Value">--</div>
399
- <div class="metric-unit">milliseconds</div>
400
- </div>
401
- <div class="metric-card">
402
- <div class="metric-label">P99 Latency</div>
403
- <div class="metric-value" id="p99Value">--</div>
404
- <div class="metric-unit">milliseconds</div>
405
- </div>
406
- <div class="metric-card">
407
- <div class="metric-label">Avg Cost</div>
408
- <div class="metric-value" id="costValue">--</div>
409
- <div class="metric-unit">cents per query</div>
410
- </div>
411
  </div>
412
  </div>
 
413
 
414
- <!-- Failures Section -->
415
- <div id="failures" class="section">
416
- <h2 class="chart-title">❌ Failure Mode Analysis</h2>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
 
418
- <div class="two-column">
419
- <div class="chart-container">
420
- <div class="chart-title">Failure Distribution</div>
421
- <div id="failureChart"></div>
422
- </div>
423
- <div class="chart-container">
424
- <div class="chart-title">Failure Summary</div>
425
- <div style="padding: 20px;">
426
- <p><strong>Total Failures:</strong> <span id="totalFailures">0</span></p>
427
- <p style="margin-top: 15px;"><strong>Hallucinations:</strong> <span id="hallCount">0</span>
428
- </p>
429
- <p><strong>Low Retrieval:</strong> <span id="retCount">0</span></p>
430
- <p><strong>Low Generation:</strong> <span id="genCount">0</span></p>
431
- <p><strong>Low Faithfulness:</strong> <span id="faithCount">0</span></p>
 
 
 
 
 
 
 
 
 
 
 
432
  </div>
433
  </div>
434
  </div>
435
-
436
- <div class="chart-container">
437
- <div class="chart-title">Recent Failures</div>
438
- <div class="failure-list" id="failureList"></div>
 
439
  </div>
440
  </div>
441
  </div>
442
  </div>
443
 
444
  <script>
 
445
  let metricsData = null;
446
  let timeseriesData = null;
447
  let failureData = null;
448
 
 
449
  function showSection(sectionId) {
450
- // Hide all sections
451
  document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
452
- document.querySelectorAll('.nav-btn').forEach(b => b.classList.remove('active'));
453
 
454
- // Show selected section
455
  document.getElementById(sectionId).classList.add('active');
456
  event.target.classList.add('active');
457
 
458
- // Load data for this section
459
- if (sectionId === 'overview') loadMetrics();
460
- else if (sectionId === 'retrieval') loadRetrievalCharts();
461
- else if (sectionId === 'generation') loadGenerationCharts();
462
- else if (sectionId === 'faithfulness') loadFaithfulnessCharts();
463
- else if (sectionId === 'performance') loadPerformanceCharts();
464
- else if (sectionId === 'failures') loadFailureCharts();
 
465
  }
466
 
467
  async function loadMetrics() {
468
  showLoading(true);
469
  try {
470
- const response = await fetch('/evaluation/metrics');
471
  metricsData = await response.json();
472
 
473
- // Update overview cards
474
- const metricsGrid = document.getElementById('metricsGrid');
475
- metricsGrid.innerHTML = `
476
- <div class="metric-card">
477
- <div class="metric-label">Total Evaluations</div>
478
- <div class="metric-value">${metricsData.total_evaluations}</div>
479
- </div>
480
- <div class="metric-card">
481
- <div class="metric-label">Avg Precision</div>
482
- <div class="metric-value">${metricsData.retrieval_precision_mean.toFixed(3)}</div>
483
- </div>
484
- <div class="metric-card">
485
- <div class="metric-label">Avg BERTScore</div>
486
- <div class="metric-value">${metricsData.bert_score_mean.toFixed(3)}</div>
487
- </div>
488
- <div class="metric-card">
489
- <div class="metric-label">Faithfulness</div>
490
- <div class="metric-value">${metricsData.faithfulness_mean.toFixed(3)}</div>
491
- </div>
492
- <div class="metric-card">
493
- <div class="metric-label">Hallucination Rate</div>
494
- <div class="metric-value">${(metricsData.hallucination_rate * 100).toFixed(1)}%</div>
495
- </div>
496
- <div class="metric-card">
497
- <div class="metric-label">Avg Latency</div>
498
- <div class="metric-value">${metricsData.latency_mean.toFixed(0)}</div>
499
- <div class="metric-unit">ms</div>
500
- </div>
501
- <div class="metric-card">
502
- <div class="metric-label">MRR</div>
503
- <div class="metric-value">${metricsData.mrr.toFixed(3)}</div>
504
- </div>
505
- <div class="metric-card">
506
- <div class="metric-label">Cost/Query</div>
507
- <div class="metric-value">$${(metricsData.cost_per_query / 100).toFixed(4)}</div>
508
- </div>
509
- `;
510
-
511
- document.getElementById('totalEvals').textContent = metricsData.total_evaluations;
512
- document.getElementById('lastUpdated').textContent = new Date(metricsData.timestamp).toLocaleString();
513
- document.getElementById('systemStatus').textContent = metricsData.hallucination_rate < 0.15 ? 'βœ… Healthy' : '⚠️ Issues Detected';
514
 
515
  } catch (e) {
516
  console.error('Error loading metrics:', e);
@@ -520,11 +685,13 @@
520
 
521
  async function loadRetrievalCharts() {
522
  try {
523
- const response = await fetch('/evaluation/timeseries');
524
- timeseriesData = await response.json();
 
 
525
 
526
  if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
527
- document.getElementById('retrievalChart').innerHTML = '<p style="padding: 20px;">No data yet</p>';
528
  return;
529
  }
530
 
@@ -533,7 +700,7 @@
533
  y: timeseriesData.retrieval_precision,
534
  name: 'Precision',
535
  mode: 'lines+markers',
536
- line: { color: '#667eea' }
537
  };
538
 
539
  const trace2 = {
@@ -541,21 +708,25 @@
541
  y: timeseriesData.retrieval_recall,
542
  name: 'Recall',
543
  mode: 'lines+markers',
544
- line: { color: '#764ba2' }
545
  };
546
 
547
- Plotly.newPlot('retrievalChart', [trace1, trace2], {
548
- title: '',
549
  xaxis: { title: 'Query Index' },
550
  yaxis: { title: 'Score' },
551
  hovermode: 'x unified',
552
- responsive: true
553
- });
 
 
 
 
 
554
 
555
  if (metricsData) {
556
- document.getElementById('mrrValue').textContent = metricsData.mrr.toFixed(3);
557
- document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean.toFixed(3);
558
- document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean.toFixed(3);
559
  }
560
 
561
  } catch (e) {
@@ -566,7 +737,7 @@
566
  async function loadGenerationCharts() {
567
  try {
568
  if (!timeseriesData) {
569
- const response = await fetch('/evaluation/timeseries');
570
  timeseriesData = await response.json();
571
  }
572
 
@@ -577,7 +748,7 @@
577
  y: timeseriesData.rouge_l,
578
  name: 'ROUGE-L',
579
  mode: 'lines+markers',
580
- line: { color: '#f39c12' }
581
  };
582
 
583
  const trace2 = {
@@ -585,23 +756,34 @@
585
  y: timeseriesData.bert_score,
586
  name: 'BERTScore',
587
  mode: 'lines+markers',
588
- line: { color: '#2ecc71' }
589
  };
590
 
591
- Plotly.newPlot('generationChart', [trace1, trace2], {
592
- title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Score' }, hovermode: 'x unified', responsive: true
593
- });
 
 
 
 
 
 
 
594
 
595
  if (metricsData) {
596
  const barsTrace = {
597
  x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
598
  y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
599
  type: 'bar',
600
- marker: { color: ['#f39c12', '#2ecc71', '#3498db'] }
601
  };
602
 
603
  Plotly.newPlot('generationBars', [barsTrace], {
604
- title: '', yaxis: { title: 'Score' }, responsive: true, showlegend: false
 
 
 
 
605
  });
606
  }
607
 
@@ -614,21 +796,24 @@
614
  try {
615
  if (!metricsData) await loadMetrics();
616
  if (!timeseriesData) {
617
- const response = await fetch('/evaluation/timeseries');
618
  timeseriesData = await response.json();
619
  }
620
 
621
- const hallRate = metricsData.hallucination_rate;
622
  const faithfulRate = 1 - hallRate;
623
 
624
  const pieTrace = {
625
  labels: ['Faithful Answers', 'Hallucinations'],
626
  values: [faithfulRate * 100, hallRate * 100],
627
  type: 'pie',
628
- marker: { colors: ['#2ecc71', '#e74c3c'] }
629
  };
630
 
631
- Plotly.newPlot('hallucinationChart', [pieTrace], { title: '', responsive: true });
 
 
 
632
 
633
  if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
634
  const faithTrace = {
@@ -636,12 +821,16 @@
636
  y: timeseriesData.faithfulness,
637
  name: 'Faithfulness',
638
  mode: 'lines+markers',
639
- line: { color: '#16a085', width: 2 },
640
  marker: { size: 6 }
641
  };
642
 
643
  Plotly.newPlot('faithfulnessChart', [faithTrace], {
644
- title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Score (0-1)' }, responsive: true
 
 
 
 
645
  });
646
  }
647
 
@@ -654,28 +843,32 @@
654
  try {
655
  if (!metricsData) await loadMetrics();
656
  if (!timeseriesData) {
657
- const response = await fetch('/evaluation/timeseries');
658
  timeseriesData = await response.json();
659
  }
660
 
661
  if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
662
  const latencyTrace = {
663
- x: timeseriesData.latency_ms,
664
  y: timeseriesData.latency_ms,
665
- mode: 'markers',
666
- marker: { size: 8, color: timeseriesData.query_idx, colorscale: 'Viridis', showscale: true },
667
- type: 'scatter'
668
  };
669
 
670
  Plotly.newPlot('latencyChart', [latencyTrace], {
671
- title: '', xaxis: { title: 'Query Index' }, yaxis: { title: 'Latency (ms)' }, responsive: true
 
 
 
 
672
  });
673
  }
674
 
675
- document.getElementById('p50Value').textContent = metricsData.latency_p50.toFixed(0);
676
- document.getElementById('p95Value').textContent = metricsData.latency_p95.toFixed(0);
677
- document.getElementById('p99Value').textContent = metricsData.latency_p99.toFixed(0);
678
- document.getElementById('costValue').textContent = (metricsData.cost_per_query / 100).toFixed(4);
679
 
680
  } catch (e) {
681
  console.error('Error loading performance charts:', e);
@@ -684,25 +877,28 @@
684
 
685
  async function loadFailureCharts() {
686
  try {
687
- const response = await fetch('/evaluation/failures');
688
  failureData = await response.json();
689
 
690
  const failureChart = {
691
  x: Object.keys(failureData.failure_modes),
692
  y: Object.values(failureData.failure_modes),
693
  type: 'bar',
694
- marker: { color: '#e74c3c' }
695
  };
696
 
697
  Plotly.newPlot('failureChart', [failureChart], {
698
- title: '', yaxis: { title: 'Count' }, responsive: true, showlegend: false
 
 
 
 
699
  });
700
 
701
  document.getElementById('totalFailures').textContent = failureData.total_failures;
702
  document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
703
  document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
704
  document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
705
- document.getElementById('faithCount').textContent = failureData.failure_modes.low_faithfulness;
706
 
707
  // Show recent failures
708
  const failureList = document.getElementById('failureList');
@@ -713,10 +909,13 @@
713
  ];
714
 
715
  allFailures.forEach(f => {
716
- html += `<div class="failure-item"><div class="failure-item-query">${f.query}</div><div class="failure-item-score">Score: ${f.score.toFixed(3)}</div></div>`;
 
 
 
717
  });
718
 
719
- failureList.innerHTML = html || '<p style="padding: 20px; color: #999;">No failures detected! πŸŽ‰</p>';
720
 
721
  } catch (e) {
722
  console.error('Error loading failure analysis:', e);
@@ -729,7 +928,7 @@
729
 
730
  async function exportResults() {
731
  try {
732
- const response = await fetch('/evaluation/export');
733
  const blob = await response.blob();
734
  const url = window.URL.createObjectURL(blob);
735
  const a = document.createElement('a');
@@ -745,7 +944,7 @@
745
  async function clearResults() {
746
  if (confirm('Are you sure you want to clear all results?')) {
747
  try {
748
- await fetch('/evaluation/reset', { method: 'POST' });
749
  metricsData = null;
750
  timeseriesData = null;
751
  failureData = null;
@@ -757,6 +956,33 @@
757
  }
758
  }
759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
760
  // Load metrics on page load
761
  window.addEventListener('load', loadMetrics);
762
  </script>
 
7
  <title>RAG Evaluation Dashboard</title>
8
  <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
9
  <style>
10
+ /* -----------------------------
11
+ Design tokens (matching index.html)
12
+ ------------------------------*/
13
+ :root {
14
+ --surface: #ffffff;
15
+ --surface-subtle: #fafafa;
16
+ --bg-main: #f5f7fb;
17
+ --card-bg: #ffffff;
18
+ --accent: #2563eb;
19
+ --accent-soft: #eff6ff;
20
+ --text-main: #111827;
21
+ --text-muted: #6b7280;
22
+ --border-soft: #e5e7eb;
23
+ --success: #16a34a;
24
+ --error: #dc2626;
25
+ --warning: #f59e0b;
26
+ --info: #2563eb;
27
+ --radius-sm: 6px;
28
+ --radius-md: 10px;
29
+ --radius-lg: 14px;
30
+ }
31
+
32
+ /* Dark mode */
33
+ [data-theme="dark"] {
34
+ --surface: #1f2937;
35
+ --surface-subtle: #111827;
36
+ --bg-main: #0f172a;
37
+ --card-bg: #1e293b;
38
+ --accent: #60a5fa;
39
+ --accent-soft: #1e3a5f;
40
+ --text-main: #f1f5f9;
41
+ --text-muted: #94a3b8;
42
+ --border-soft: #334155;
43
+ }
44
+
45
+ /* -----------------------------
46
+ Reset
47
+ ------------------------------*/
48
  * {
49
  margin: 0;
50
  padding: 0;
51
  box-sizing: border-box;
52
  }
53
 
54
+ /* -----------------------------
55
+ Base
56
+ ------------------------------*/
57
  body {
58
+ font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
59
+ background: var(--bg-main);
60
  min-height: 100vh;
61
+ padding: 24px;
62
+ color: var(--text-main);
63
  }
64
 
65
+ /* -----------------------------
66
+ Container
67
+ ------------------------------*/
68
  .container {
69
+ max-width: 1200px;
70
  margin: 0 auto;
 
 
 
 
71
  }
72
 
73
+ /* -----------------------------
74
+ Header
75
+ ------------------------------*/
76
+ header {
77
  text-align: center;
78
+ margin-bottom: 36px;
79
  }
80
 
81
+ header h1 {
82
+ font-size: 2.2rem;
83
+ font-weight: 600;
84
+ letter-spacing: -0.02em;
85
+ margin-bottom: 8px;
86
  }
87
 
88
+ header p {
89
+ font-size: 1rem;
90
+ color: var(--text-muted);
91
+ margin-bottom: 20px;
92
  }
93
 
94
  .nav-buttons {
95
  display: flex;
 
96
  justify-content: center;
97
+ gap: 12px;
98
  flex-wrap: wrap;
99
+ margin-bottom: 12px;
100
  }
101
 
102
  .nav-btn {
103
+ display: inline-flex;
104
+ align-items: center;
105
+ gap: 8px;
106
+ padding: 10px 18px;
107
+ background: var(--accent);
108
  color: white;
109
+ text-decoration: none;
110
+ border-radius: var(--radius-md);
111
+ font-size: 0.9rem;
112
+ font-weight: 500;
113
+ transition: background 0.15s ease, transform 0.15s ease;
114
+ border: none;
115
  cursor: pointer;
 
 
116
  }
117
 
118
+ .nav-btn:hover {
119
+ background: #1d4ed8;
120
+ transform: translateY(-1px);
 
121
  }
122
 
123
+ .nav-btn.secondary {
124
+ background: var(--surface);
125
+ color: var(--text-main);
126
+ border: 1px solid var(--border-soft);
127
  }
128
 
129
+ .nav-btn.secondary:hover {
130
+ background: var(--surface-subtle);
131
  }
132
 
133
+ [data-theme="dark"] .nav-btn {
134
+ background: #60a5fa;
 
135
  }
136
 
137
+ [data-theme="dark"] .nav-btn:hover {
138
+ background: #3b82f6;
139
+ }
 
140
 
141
+ /* Theme toggle */
142
+ .theme-toggle {
143
+ position: fixed;
144
+ top: 20px;
145
+ right: 20px;
146
+ padding: 8px 14px;
147
+ background: var(--surface);
148
+ border: 1px solid var(--border-soft);
149
+ border-radius: var(--radius-md);
150
+ cursor: pointer;
151
+ font-size: 0.85rem;
152
+ color: var(--text-main);
153
+ z-index: 100;
154
  }
155
 
156
+ /* -----------------------------
157
+ Tab Navigation
158
+ ------------------------------*/
159
+ .tab-nav {
160
+ display: flex;
161
+ justify-content: center;
162
+ gap: 8px;
163
+ margin-bottom: 24px;
164
+ flex-wrap: wrap;
165
+ }
166
+
167
+ .tab-btn {
168
+ padding: 10px 20px;
169
+ background: var(--surface);
170
+ border: 1px solid var(--border-soft);
171
+ color: var(--text-main);
172
+ border-radius: var(--radius-md);
173
+ cursor: pointer;
174
+ font-size: 0.9rem;
175
+ transition: all 0.15s ease;
176
+ }
177
+
178
+ .tab-btn:hover {
179
+ background: var(--surface-subtle);
180
+ }
181
+
182
+ .tab-btn.active {
183
+ background: var(--accent);
184
+ color: white;
185
+ border-color: var(--accent);
186
+ }
187
+
188
+ /* -----------------------------
189
+ Cards
190
+ ------------------------------*/
191
+ .card {
192
+ background: var(--surface);
193
+ border-radius: var(--radius-lg);
194
+ padding: 24px;
195
+ border: 1px solid var(--border-soft);
196
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
197
+ margin-bottom: 24px;
198
+ }
199
+
200
+ .card h2 {
201
+ font-size: 1.1rem;
202
+ font-weight: 600;
203
+ margin-bottom: 16px;
204
+ color: var(--text-main);
205
+ }
206
+
207
+ /* -----------------------------
208
+ Metrics Grid
209
+ ------------------------------*/
210
  .metrics-grid {
211
  display: grid;
212
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
213
+ gap: 16px;
214
+ margin-bottom: 24px;
215
  }
216
 
217
  .metric-card {
218
+ background: var(--accent);
219
  color: white;
220
+ padding: 20px;
221
+ border-radius: var(--radius-md);
222
  text-align: center;
223
+ transition: transform 0.15s ease;
 
224
  }
225
 
226
  .metric-card:hover {
227
+ transform: translateY(-2px);
228
+ }
229
+
230
+ .metric-card.success {
231
+ background: var(--success);
232
+ }
233
+
234
+ .metric-card.warning {
235
+ background: var(--warning);
236
+ }
237
+
238
+ .metric-card.error {
239
+ background: var(--error);
240
  }
241
 
242
  .metric-label {
243
+ font-size: 0.8rem;
244
  opacity: 0.9;
245
+ margin-bottom: 8px;
246
  text-transform: uppercase;
247
+ letter-spacing: 0.5px;
248
  }
249
 
250
  .metric-value {
251
+ font-size: 1.8rem;
252
+ font-weight: 700;
253
  }
254
 
255
  .metric-unit {
256
+ font-size: 0.75rem;
257
  opacity: 0.8;
258
+ margin-top: 4px;
259
  }
260
 
261
+ /* -----------------------------
262
+ Charts
263
+ ------------------------------*/
264
  .chart-container {
265
+ background: var(--surface);
266
+ border: 1px solid var(--border-soft);
267
+ border-radius: var(--radius-md);
268
+ padding: 16px;
269
+ margin-bottom: 20px;
270
+ min-height: 350px;
271
  }
272
 
273
  .chart-title {
274
+ font-size: 1rem;
275
  font-weight: 600;
276
+ margin-bottom: 12px;
277
+ color: var(--text-main);
278
  }
279
 
280
  .two-column {
281
  display: grid;
282
  grid-template-columns: 1fr 1fr;
283
  gap: 20px;
 
284
  }
285
 
286
  @media (max-width: 900px) {
 
289
  }
290
  }
291
 
292
+ /* -----------------------------
293
+ Sections
294
+ ------------------------------*/
295
+ .section {
296
+ display: none;
297
  }
298
 
299
+ .section.active {
300
+ display: block;
301
+ animation: fadeIn 0.3s ease;
302
  }
303
 
304
+ @keyframes fadeIn {
305
+ from {
306
+ opacity: 0;
307
+ transform: translateY(10px);
308
+ }
309
+
310
+ to {
311
+ opacity: 1;
312
+ transform: translateY(0);
313
+ }
314
  }
315
 
316
+ /* -----------------------------
317
+ Info Box
318
+ ------------------------------*/
319
  .info-box {
320
+ background: var(--accent-soft);
321
+ border-left: 4px solid var(--accent);
322
+ padding: 14px;
323
+ border-radius: var(--radius-sm);
324
  margin-bottom: 20px;
 
325
  }
326
 
327
  .info-box p {
328
+ font-size: 0.9rem;
329
+ color: var(--text-muted);
330
  }
331
 
332
+ /* -----------------------------
333
+ Button Group
334
+ ------------------------------*/
335
  .button-group {
336
  display: flex;
337
  gap: 10px;
 
340
  }
341
 
342
  .btn {
343
+ padding: 10px 18px;
344
+ background: var(--accent);
345
  color: white;
346
  border: none;
347
+ border-radius: var(--radius-md);
348
  cursor: pointer;
349
+ font-size: 0.9rem;
350
+ font-weight: 500;
351
+ transition: background 0.15s ease;
352
  }
353
 
354
  .btn:hover {
355
+ background: #1d4ed8;
356
  }
357
 
358
  .btn-secondary {
359
+ background: var(--surface);
360
+ color: var(--text-main);
361
+ border: 1px solid var(--border-soft);
362
  }
363
 
364
  .btn-secondary:hover {
365
+ background: var(--surface-subtle);
366
  }
367
 
368
+ /* -----------------------------
369
+ Loading
370
+ ------------------------------*/
371
  .loading {
372
  display: none;
373
  text-align: center;
374
+ padding: 40px;
375
+ color: var(--accent);
376
  }
377
 
378
  .spinner {
379
+ border: 4px solid var(--border-soft);
380
+ border-top: 4px solid var(--accent);
381
  border-radius: 50%;
382
  width: 40px;
383
  height: 40px;
384
  animation: spin 1s linear infinite;
385
+ margin: 0 auto 15px;
386
  }
387
 
388
  @keyframes spin {
 
395
  }
396
  }
397
 
398
+ /* -----------------------------
399
+ Failure Items
400
+ ------------------------------*/
 
 
401
  .failure-item {
402
+ background: var(--surface-subtle);
403
+ border-left: 4px solid var(--error);
404
+ padding: 12px;
405
+ margin-bottom: 10px;
406
+ border-radius: var(--radius-sm);
407
  }
408
 
409
  .failure-item-query {
410
+ font-size: 0.9rem;
411
+ color: var(--text-main);
412
+ margin-bottom: 4px;
413
  }
414
 
415
  .failure-item-score {
416
+ font-size: 0.8rem;
417
+ color: var(--error);
418
+ font-weight: 500;
419
+ }
420
+
421
+ /* -----------------------------
422
+ Status Colors
423
+ ------------------------------*/
424
+ .status-good {
425
+ color: var(--success);
426
+ }
427
+
428
+ .status-warning {
429
+ color: var(--warning);
430
+ }
431
+
432
+ .status-critical {
433
+ color: var(--error);
434
+ }
435
+
436
+ /* Plotly chart background fix for dark mode */
437
+ [data-theme="dark"] .js-plotly-plot .plotly .bg {
438
+ fill: var(--surface) !important;
439
  }
440
  </style>
441
  </head>
442
 
443
  <body>
444
+ <button class="theme-toggle" id="themeToggle">πŸŒ™ Dark mode</button>
445
+
446
  <div class="container">
447
+ <header>
448
+ <h1>πŸ“Š RAG Evaluation Dashboard</h1>
449
+ <p>Real-time metrics and analysis for your RAG pipeline</p>
450
  <div class="nav-buttons">
451
+ <a href="/" class="nav-btn secondary">← Back to Main</a>
452
+ <a href="/ragas-demo" class="nav-btn">πŸ”¬ RAGAS Evaluation</a>
 
 
 
 
453
  </div>
454
+ </header>
455
+
456
+ <!-- Tab Navigation -->
457
+ <div class="tab-nav">
458
+ <button class="tab-btn active" onclick="showSection('overview')">Overview</button>
459
+ <button class="tab-btn" onclick="showSection('retrieval')">Retrieval</button>
460
+ <button class="tab-btn" onclick="showSection('generation')">Generation</button>
461
+ <button class="tab-btn" onclick="showSection('faithfulness')">Faithfulness</button>
462
+ <button class="tab-btn" onclick="showSection('performance')">Performance</button>
463
+ <button class="tab-btn" onclick="showSection('failures')">Failures</button>
464
  </div>
465
 
466
+ <div class="loading" id="loading">
467
+ <div class="spinner"></div>
468
+ <p>Loading evaluation data...</p>
469
+ </div>
 
 
 
 
470
 
471
+ <!-- Overview Section -->
472
+ <div class="section active" id="overview">
473
+ <div class="metrics-grid">
474
+ <div class="metric-card">
475
+ <div class="metric-label">Total Evaluations</div>
476
+ <div class="metric-value" id="totalEvaluations">0</div>
477
  </div>
478
+ <div class="metric-card success">
479
+ <div class="metric-label">Avg Precision</div>
480
+ <div class="metric-value" id="overviewPrecision">-</div>
481
  </div>
482
+ <div class="metric-card">
483
+ <div class="metric-label">Avg Recall</div>
484
+ <div class="metric-value" id="overviewRecall">-</div>
485
+ </div>
486
+ <div class="metric-card success">
487
+ <div class="metric-label">Avg BERTScore</div>
488
+ <div class="metric-value" id="overviewBert">-</div>
489
+ </div>
490
+ <div class="metric-card">
491
+ <div class="metric-label">Avg Faithfulness</div>
492
+ <div class="metric-value" id="overviewFaith">-</div>
493
+ </div>
494
+ <div class="metric-card warning">
495
+ <div class="metric-label">Hallucination Rate</div>
496
+ <div class="metric-value" id="overviewHalluc">-</div>
497
+ </div>
498
+ <div class="metric-card">
499
+ <div class="metric-label">Avg Latency</div>
500
+ <div class="metric-value" id="overviewLatency">-</div>
501
+ <div class="metric-unit">ms</div>
502
+ </div>
503
+ <div class="metric-card">
504
+ <div class="metric-label">MRR</div>
505
+ <div class="metric-value" id="overviewMRR">-</div>
506
  </div>
507
  </div>
508
 
509
+ <div class="button-group">
510
+ <button class="btn" onclick="loadMetrics()">πŸ”„ Refresh</button>
511
+ <button class="btn btn-secondary" onclick="exportResults()">πŸ“₯ Export CSV</button>
512
+ <button class="btn btn-secondary" onclick="clearResults()">πŸ—‘οΈ Clear Results</button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  </div>
514
 
515
+ <div class="info-box">
516
+ <p><strong>How to use:</strong> This dashboard shows evaluation metrics from your RAG pipeline.
517
+ Use the tabs above to explore different aspects of performance.
518
+ Try the <a href="/ragas-demo">RAGAS Evaluation</a> to run live evaluations.</p>
519
+ </div>
520
+ </div>
521
 
522
+ <!-- Retrieval Section -->
523
+ <div class="section" id="retrieval">
524
+ <div class="card">
525
+ <h2>Retrieval Metrics Over Time</h2>
526
+ <div class="chart-container" id="retrievalChart"></div>
527
+ </div>
528
+ <div class="metrics-grid">
529
+ <div class="metric-card">
530
+ <div class="metric-label">MRR</div>
531
+ <div class="metric-value" id="mrrValue">-</div>
532
  </div>
533
+ <div class="metric-card success">
534
+ <div class="metric-label">Avg Precision</div>
535
+ <div class="metric-value" id="avgPrecision">-</div>
536
+ </div>
537
+ <div class="metric-card">
538
+ <div class="metric-label">Avg Recall</div>
539
+ <div class="metric-value" id="avgRecall">-</div>
540
  </div>
541
  </div>
542
+ </div>
543
 
544
+ <!-- Generation Section -->
545
+ <div class="section" id="generation">
546
+ <div class="two-column">
547
+ <div class="card">
548
+ <h2>Generation Quality Over Time</h2>
549
+ <div class="chart-container" id="generationChart"></div>
550
+ </div>
551
+ <div class="card">
552
+ <h2>Average Scores</h2>
553
+ <div class="chart-container" id="generationBars"></div>
 
 
 
554
  </div>
555
  </div>
556
+ </div>
557
 
558
+ <!-- Faithfulness Section -->
559
+ <div class="section" id="faithfulness">
560
+ <div class="two-column">
561
+ <div class="card">
562
+ <h2>Hallucination Rate</h2>
563
+ <div class="chart-container" id="hallucinationChart"></div>
 
 
 
 
 
 
 
564
  </div>
565
+ <div class="card">
566
+ <h2>Faithfulness Over Time</h2>
567
+ <div class="chart-container" id="faithfulnessChart"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  </div>
569
  </div>
570
+ </div>
571
 
572
+ <!-- Performance Section -->
573
+ <div class="section" id="performance">
574
+ <div class="card">
575
+ <h2>Latency Distribution</h2>
576
+ <div class="chart-container" id="latencyChart"></div>
577
+ </div>
578
+ <div class="metrics-grid">
579
+ <div class="metric-card">
580
+ <div class="metric-label">P50 Latency</div>
581
+ <div class="metric-value" id="p50Value">-</div>
582
+ <div class="metric-unit">ms</div>
583
+ </div>
584
+ <div class="metric-card warning">
585
+ <div class="metric-label">P95 Latency</div>
586
+ <div class="metric-value" id="p95Value">-</div>
587
+ <div class="metric-unit">ms</div>
588
+ </div>
589
+ <div class="metric-card">
590
+ <div class="metric-label">P99 Latency</div>
591
+ <div class="metric-value" id="p99Value">-</div>
592
+ <div class="metric-unit">ms</div>
593
+ </div>
594
+ <div class="metric-card">
595
+ <div class="metric-label">Cost per Query</div>
596
+ <div class="metric-value" id="costValue">-</div>
597
+ <div class="metric-unit">USD</div>
598
+ </div>
599
+ </div>
600
+ </div>
601
 
602
+ <!-- Failures Section -->
603
+ <div class="section" id="failures">
604
+ <div class="two-column">
605
+ <div class="card">
606
+ <h2>Failure Modes</h2>
607
+ <div class="chart-container" id="failureChart"></div>
608
+ </div>
609
+ <div class="card">
610
+ <h2>Failure Counts</h2>
611
+ <div class="metrics-grid" style="margin-top: 20px;">
612
+ <div class="metric-card error">
613
+ <div class="metric-label">Total Failures</div>
614
+ <div class="metric-value" id="totalFailures">0</div>
615
+ </div>
616
+ <div class="metric-card error">
617
+ <div class="metric-label">Hallucinations</div>
618
+ <div class="metric-value" id="hallCount">0</div>
619
+ </div>
620
+ <div class="metric-card warning">
621
+ <div class="metric-label">Low Retrieval</div>
622
+ <div class="metric-value" id="retCount">0</div>
623
+ </div>
624
+ <div class="metric-card warning">
625
+ <div class="metric-label">Low Generation</div>
626
+ <div class="metric-value" id="genCount">0</div>
627
  </div>
628
  </div>
629
  </div>
630
+ </div>
631
+ <div class="card">
632
+ <h2>Recent Failures</h2>
633
+ <div id="failureList">
634
+ <p style="color: var(--text-muted); padding: 20px;">No failures detected! πŸŽ‰</p>
635
  </div>
636
  </div>
637
  </div>
638
  </div>
639
 
640
  <script>
641
+ const API_URL = window.location.origin;
642
  let metricsData = null;
643
  let timeseriesData = null;
644
  let failureData = null;
645
 
646
+ // Tab navigation
647
  function showSection(sectionId) {
 
648
  document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
649
+ document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
650
 
 
651
  document.getElementById(sectionId).classList.add('active');
652
  event.target.classList.add('active');
653
 
654
+ // Load section-specific data
655
+ switch (sectionId) {
656
+ case 'retrieval': loadRetrievalCharts(); break;
657
+ case 'generation': loadGenerationCharts(); break;
658
+ case 'faithfulness': loadFaithfulnessCharts(); break;
659
+ case 'performance': loadPerformanceCharts(); break;
660
+ case 'failures': loadFailureCharts(); break;
661
+ }
662
  }
663
 
664
  async function loadMetrics() {
665
  showLoading(true);
666
  try {
667
+ const response = await fetch(`${API_URL}/evaluation/metrics`);
668
  metricsData = await response.json();
669
 
670
+ // Update overview
671
+ document.getElementById('totalEvaluations').textContent = metricsData.total_evaluations;
672
+ document.getElementById('overviewPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
673
+ document.getElementById('overviewRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
674
+ document.getElementById('overviewBert').textContent = metricsData.bert_score_mean?.toFixed(3) || '-';
675
+ document.getElementById('overviewFaith').textContent = metricsData.faithfulness_mean?.toFixed(3) || '-';
676
+ document.getElementById('overviewHalluc').textContent = ((metricsData.hallucination_rate || 0) * 100).toFixed(1) + '%';
677
+ document.getElementById('overviewLatency').textContent = metricsData.latency_mean?.toFixed(0) || '-';
678
+ document.getElementById('overviewMRR').textContent = metricsData.mrr?.toFixed(3) || '-';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
 
680
  } catch (e) {
681
  console.error('Error loading metrics:', e);
 
685
 
686
  async function loadRetrievalCharts() {
687
  try {
688
+ if (!timeseriesData) {
689
+ const response = await fetch(`${API_URL}/evaluation/timeseries`);
690
+ timeseriesData = await response.json();
691
+ }
692
 
693
  if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
694
+ document.getElementById('retrievalChart').innerHTML = '<p style="padding: 40px; text-align: center; color: var(--text-muted);">No data yet. Run some evaluations first.</p>';
695
  return;
696
  }
697
 
 
700
  y: timeseriesData.retrieval_precision,
701
  name: 'Precision',
702
  mode: 'lines+markers',
703
+ line: { color: '#2563eb' }
704
  };
705
 
706
  const trace2 = {
 
708
  y: timeseriesData.retrieval_recall,
709
  name: 'Recall',
710
  mode: 'lines+markers',
711
+ line: { color: '#16a34a' }
712
  };
713
 
714
+ const layout = {
 
715
  xaxis: { title: 'Query Index' },
716
  yaxis: { title: 'Score' },
717
  hovermode: 'x unified',
718
+ responsive: true,
719
+ paper_bgcolor: 'rgba(0,0,0,0)',
720
+ plot_bgcolor: 'rgba(0,0,0,0)',
721
+ font: { color: getComputedStyle(document.body).getPropertyValue('--text-main') }
722
+ };
723
+
724
+ Plotly.newPlot('retrievalChart', [trace1, trace2], layout);
725
 
726
  if (metricsData) {
727
+ document.getElementById('mrrValue').textContent = metricsData.mrr?.toFixed(3) || '-';
728
+ document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
729
+ document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
730
  }
731
 
732
  } catch (e) {
 
737
  async function loadGenerationCharts() {
738
  try {
739
  if (!timeseriesData) {
740
+ const response = await fetch(`${API_URL}/evaluation/timeseries`);
741
  timeseriesData = await response.json();
742
  }
743
 
 
748
  y: timeseriesData.rouge_l,
749
  name: 'ROUGE-L',
750
  mode: 'lines+markers',
751
+ line: { color: '#f59e0b' }
752
  };
753
 
754
  const trace2 = {
 
756
  y: timeseriesData.bert_score,
757
  name: 'BERTScore',
758
  mode: 'lines+markers',
759
+ line: { color: '#16a34a' }
760
  };
761
 
762
+ const layout = {
763
+ xaxis: { title: 'Query Index' },
764
+ yaxis: { title: 'Score' },
765
+ hovermode: 'x unified',
766
+ responsive: true,
767
+ paper_bgcolor: 'rgba(0,0,0,0)',
768
+ plot_bgcolor: 'rgba(0,0,0,0)'
769
+ };
770
+
771
+ Plotly.newPlot('generationChart', [trace1, trace2], layout);
772
 
773
  if (metricsData) {
774
  const barsTrace = {
775
  x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
776
  y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
777
  type: 'bar',
778
+ marker: { color: ['#f59e0b', '#16a34a', '#2563eb'] }
779
  };
780
 
781
  Plotly.newPlot('generationBars', [barsTrace], {
782
+ yaxis: { title: 'Score' },
783
+ responsive: true,
784
+ showlegend: false,
785
+ paper_bgcolor: 'rgba(0,0,0,0)',
786
+ plot_bgcolor: 'rgba(0,0,0,0)'
787
  });
788
  }
789
 
 
796
  try {
797
  if (!metricsData) await loadMetrics();
798
  if (!timeseriesData) {
799
+ const response = await fetch(`${API_URL}/evaluation/timeseries`);
800
  timeseriesData = await response.json();
801
  }
802
 
803
+ const hallRate = metricsData.hallucination_rate || 0;
804
  const faithfulRate = 1 - hallRate;
805
 
806
  const pieTrace = {
807
  labels: ['Faithful Answers', 'Hallucinations'],
808
  values: [faithfulRate * 100, hallRate * 100],
809
  type: 'pie',
810
+ marker: { colors: ['#16a34a', '#dc2626'] }
811
  };
812
 
813
+ Plotly.newPlot('hallucinationChart', [pieTrace], {
814
+ responsive: true,
815
+ paper_bgcolor: 'rgba(0,0,0,0)'
816
+ });
817
 
818
  if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
819
  const faithTrace = {
 
821
  y: timeseriesData.faithfulness,
822
  name: 'Faithfulness',
823
  mode: 'lines+markers',
824
+ line: { color: '#2563eb', width: 2 },
825
  marker: { size: 6 }
826
  };
827
 
828
  Plotly.newPlot('faithfulnessChart', [faithTrace], {
829
+ xaxis: { title: 'Query Index' },
830
+ yaxis: { title: 'Score (0-1)' },
831
+ responsive: true,
832
+ paper_bgcolor: 'rgba(0,0,0,0)',
833
+ plot_bgcolor: 'rgba(0,0,0,0)'
834
  });
835
  }
836
 
 
843
  try {
844
  if (!metricsData) await loadMetrics();
845
  if (!timeseriesData) {
846
+ const response = await fetch(`${API_URL}/evaluation/timeseries`);
847
  timeseriesData = await response.json();
848
  }
849
 
850
  if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
851
  const latencyTrace = {
852
+ x: timeseriesData.query_idx,
853
  y: timeseriesData.latency_ms,
854
+ mode: 'lines+markers',
855
+ line: { color: '#2563eb' },
856
+ marker: { size: 6 }
857
  };
858
 
859
  Plotly.newPlot('latencyChart', [latencyTrace], {
860
+ xaxis: { title: 'Query Index' },
861
+ yaxis: { title: 'Latency (ms)' },
862
+ responsive: true,
863
+ paper_bgcolor: 'rgba(0,0,0,0)',
864
+ plot_bgcolor: 'rgba(0,0,0,0)'
865
  });
866
  }
867
 
868
+ document.getElementById('p50Value').textContent = metricsData.latency_p50?.toFixed(0) || '-';
869
+ document.getElementById('p95Value').textContent = metricsData.latency_p95?.toFixed(0) || '-';
870
+ document.getElementById('p99Value').textContent = metricsData.latency_p99?.toFixed(0) || '-';
871
+ document.getElementById('costValue').textContent = metricsData.cost_per_query ? (metricsData.cost_per_query / 100).toFixed(4) : '-';
872
 
873
  } catch (e) {
874
  console.error('Error loading performance charts:', e);
 
877
 
878
  async function loadFailureCharts() {
879
  try {
880
+ const response = await fetch(`${API_URL}/evaluation/failures`);
881
  failureData = await response.json();
882
 
883
  const failureChart = {
884
  x: Object.keys(failureData.failure_modes),
885
  y: Object.values(failureData.failure_modes),
886
  type: 'bar',
887
+ marker: { color: '#dc2626' }
888
  };
889
 
890
  Plotly.newPlot('failureChart', [failureChart], {
891
+ yaxis: { title: 'Count' },
892
+ responsive: true,
893
+ showlegend: false,
894
+ paper_bgcolor: 'rgba(0,0,0,0)',
895
+ plot_bgcolor: 'rgba(0,0,0,0)'
896
  });
897
 
898
  document.getElementById('totalFailures').textContent = failureData.total_failures;
899
  document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
900
  document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
901
  document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
 
902
 
903
  // Show recent failures
904
  const failureList = document.getElementById('failureList');
 
909
  ];
910
 
911
  allFailures.forEach(f => {
912
+ html += `<div class="failure-item">
913
+ <div class="failure-item-query">${f.query}</div>
914
+ <div class="failure-item-score">Score: ${f.score.toFixed(3)}</div>
915
+ </div>`;
916
  });
917
 
918
+ failureList.innerHTML = html || '<p style="padding: 20px; color: var(--text-muted);">No failures detected! πŸŽ‰</p>';
919
 
920
  } catch (e) {
921
  console.error('Error loading failure analysis:', e);
 
928
 
929
  async function exportResults() {
930
  try {
931
+ const response = await fetch(`${API_URL}/evaluation/export`);
932
  const blob = await response.blob();
933
  const url = window.URL.createObjectURL(blob);
934
  const a = document.createElement('a');
 
944
  async function clearResults() {
945
  if (confirm('Are you sure you want to clear all results?')) {
946
  try {
947
+ await fetch(`${API_URL}/evaluation/reset`, { method: 'POST' });
948
  metricsData = null;
949
  timeseriesData = null;
950
  failureData = null;
 
956
  }
957
  }
958
 
959
+ // Dark mode toggle
960
+ const themeToggle = document.getElementById("themeToggle");
961
+ const root = document.documentElement;
962
+
963
+ const savedTheme = localStorage.getItem("theme");
964
+ const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
965
+
966
+ if (savedTheme) {
967
+ root.setAttribute("data-theme", savedTheme);
968
+ } else if (prefersDark) {
969
+ root.setAttribute("data-theme", "dark");
970
+ }
971
+
972
+ function updateToggleText() {
973
+ const isDark = root.getAttribute("data-theme") === "dark";
974
+ themeToggle.textContent = isDark ? "β˜€οΈ Light mode" : "πŸŒ™ Dark mode";
975
+ }
976
+ updateToggleText();
977
+
978
+ themeToggle.addEventListener("click", () => {
979
+ const isDark = root.getAttribute("data-theme") === "dark";
980
+ const newTheme = isDark ? "light" : "dark";
981
+ root.setAttribute("data-theme", newTheme);
982
+ localStorage.setItem("theme", newTheme);
983
+ updateToggleText();
984
+ });
985
+
986
  // Load metrics on page load
987
  window.addEventListener('load', loadMetrics);
988
  </script>
frontend/index.html CHANGED
@@ -485,6 +485,7 @@
485
  <p>Ask questions about your research papers</p>
486
  <nav>
487
  <a href="/evaluation" class="eval-button">πŸ“Š Evaluation Dashboard</a>
 
488
  </nav>
489
  <button id="themeToggle" aria-label="Toggle dark mode" style="
490
  position: fixed;
 
485
  <p>Ask questions about your research papers</p>
486
  <nav>
487
  <a href="/evaluation" class="eval-button">πŸ“Š Evaluation Dashboard</a>
488
+ <a href="/ragas-demo" class="eval-button">πŸ”¬ RAGAS Evaluation</a>
489
  </nav>
490
  <button id="themeToggle" aria-label="Toggle dark mode" style="
491
  position: fixed;
frontend/index_.html ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Document Intelligence RAG</title>
8
+ <style>
9
+ * {
10
+ margin: 0;
11
+ padding: 0;
12
+ box-sizing: border-box;
13
+ }
14
+
15
+ body {
16
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
17
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
18
+ min-height: 100vh;
19
+ padding: 20px;
20
+ }
21
+
22
+ .container {
23
+ max-width: 1000px;
24
+ margin: 0 auto;
25
+ }
26
+
27
+ header {
28
+ text-align: center;
29
+ color: white;
30
+ margin-bottom: 40px;
31
+ }
32
+
33
+ header h1 {
34
+ font-size: 2.5em;
35
+ margin-bottom: 10px;
36
+ text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
37
+ }
38
+
39
+ header p {
40
+ font-size: 1.1em;
41
+ opacity: 0.9;
42
+ }
43
+
44
+ .main-grid {
45
+ display: grid;
46
+ grid-template-columns: 1fr 1fr;
47
+ gap: 20px;
48
+ margin-bottom: 20px;
49
+ }
50
+
51
+ .card {
52
+ background: white;
53
+ border-radius: 12px;
54
+ padding: 25px;
55
+ box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
56
+ }
57
+
58
+ .card h2 {
59
+ color: #333;
60
+ margin-bottom: 15px;
61
+ font-size: 1.3em;
62
+ }
63
+
64
+ .upload-area {
65
+ border: 2px dashed #667eea;
66
+ border-radius: 8px;
67
+ padding: 30px;
68
+ text-align: center;
69
+ cursor: pointer;
70
+ transition: all 0.3s;
71
+ }
72
+
73
+ .upload-area:hover {
74
+ border-color: #764ba2;
75
+ background: #f8f9ff;
76
+ }
77
+
78
+ .upload-area.dragover {
79
+ border-color: #764ba2;
80
+ background: #f0f2ff;
81
+ }
82
+
83
+ .upload-area input {
84
+ display: none;
85
+ }
86
+
87
+ .upload-area p {
88
+ color: #666;
89
+ margin-bottom: 10px;
90
+ }
91
+
92
+ .btn {
93
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
94
+ color: white;
95
+ border: none;
96
+ padding: 12px 24px;
97
+ border-radius: 8px;
98
+ cursor: pointer;
99
+ font-size: 1em;
100
+ font-weight: 600;
101
+ transition: transform 0.2s, box-shadow 0.2s;
102
+ }
103
+
104
+ .btn:hover {
105
+ transform: translateY(-2px);
106
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
107
+ }
108
+
109
+ .btn:active {
110
+ transform: translateY(0);
111
+ }
112
+
113
+ .btn-secondary {
114
+ background: #f0f0f0;
115
+ color: #333;
116
+ }
117
+
118
+ .btn-secondary:hover {
119
+ background: #e0e0e0;
120
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
121
+ }
122
+
123
+ .query-input {
124
+ display: flex;
125
+ gap: 10px;
126
+ margin-bottom: 20px;
127
+ }
128
+
129
+ .query-input input {
130
+ flex: 1;
131
+ padding: 12px;
132
+ border: 2px solid #e0e0e0;
133
+ border-radius: 8px;
134
+ font-size: 1em;
135
+ transition: border-color 0.3s;
136
+ }
137
+
138
+ .query-input input:focus {
139
+ outline: none;
140
+ border-color: #667eea;
141
+ }
142
+
143
+ .status {
144
+ padding: 15px;
145
+ border-radius: 8px;
146
+ margin-bottom: 15px;
147
+ font-size: 0.95em;
148
+ }
149
+
150
+ .status.success {
151
+ background: #d4edda;
152
+ color: #155724;
153
+ border-left: 4px solid #28a745;
154
+ }
155
+
156
+ .status.error {
157
+ background: #f8d7da;
158
+ color: #721c24;
159
+ border-left: 4px solid #f5c6cb;
160
+ }
161
+
162
+ .status.loading {
163
+ background: #e7f3ff;
164
+ color: #004085;
165
+ border-left: 4px solid #0c5ff4;
166
+ }
167
+
168
+ .answer-box {
169
+ background: #f8f9fa;
170
+ border-left: 4px solid #667eea;
171
+ padding: 15px;
172
+ border-radius: 8px;
173
+ margin-bottom: 20px;
174
+ }
175
+
176
+ .answer-box h3 {
177
+ color: #333;
178
+ margin-bottom: 10px;
179
+ }
180
+
181
+ .answer-box p {
182
+ color: #555;
183
+ line-height: 1.6;
184
+ margin-bottom: 15px;
185
+ }
186
+
187
+ .sources {
188
+ background: white;
189
+ border-radius: 8px;
190
+ padding: 15px;
191
+ margin-bottom: 15px;
192
+ }
193
+
194
+ .sources h4 {
195
+ color: #333;
196
+ margin-bottom: 12px;
197
+ font-size: 0.95em;
198
+ }
199
+
200
+ .source-item {
201
+ padding: 10px;
202
+ background: #f8f9fa;
203
+ border-radius: 6px;
204
+ margin-bottom: 8px;
205
+ border-left: 3px solid #667eea;
206
+ font-size: 0.9em;
207
+ }
208
+
209
+ .source-item .relevance {
210
+ color: #667eea;
211
+ font-weight: 600;
212
+ margin-bottom: 5px;
213
+ }
214
+
215
+ .source-item .text {
216
+ color: #555;
217
+ font-style: italic;
218
+ }
219
+
220
+ .stats {
221
+ display: grid;
222
+ grid-template-columns: repeat(2, 1fr);
223
+ gap: 10px;
224
+ margin-bottom: 20px;
225
+ }
226
+
227
+ .stat-box {
228
+ background: #f8f9fa;
229
+ padding: 12px;
230
+ border-radius: 6px;
231
+ text-align: center;
232
+ }
233
+
234
+ .stat-box .number {
235
+ font-size: 1.5em;
236
+ font-weight: bold;
237
+ color: #667eea;
238
+ }
239
+
240
+ .stat-box .label {
241
+ font-size: 0.85em;
242
+ color: #666;
243
+ margin-top: 5px;
244
+ }
245
+
246
+ .status-grid {
247
+ display: grid;
248
+ grid-template-columns: repeat(4, 1fr);
249
+ gap: 10px;
250
+ }
251
+
252
+ .loading-spinner {
253
+ display: inline-block;
254
+ width: 20px;
255
+ height: 20px;
256
+ border: 3px solid #f3f3f3;
257
+ border-top: 3px solid #667eea;
258
+ border-radius: 50%;
259
+ animation: spin 1s linear infinite;
260
+ margin-right: 10px;
261
+ vertical-align: middle;
262
+ }
263
+
264
+ @keyframes spin {
265
+ 0% {
266
+ transform: rotate(0deg);
267
+ }
268
+
269
+ 100% {
270
+ transform: rotate(360deg);
271
+ }
272
+ }
273
+
274
+ .full-width {
275
+ grid-column: 1 / -1;
276
+ }
277
+
278
+ @media (max-width: 768px) {
279
+ .main-grid {
280
+ grid-template-columns: 1fr;
281
+ }
282
+
283
+ header h1 {
284
+ font-size: 1.8em;
285
+ }
286
+
287
+ .stats {
288
+ grid-template-columns: 1fr;
289
+ }
290
+
291
+ .status-grid {
292
+ grid-template-columns: repeat(2, 1fr);
293
+ }
294
+ }
295
+
296
+ .hidden {
297
+ display: none;
298
+ }
299
+ </style>
300
+ </head>
301
+
302
+ <body>
303
+ <div class="container">
304
+ <header>
305
+ <h1>πŸ“š Document Intelligence RAG</h1>
306
+ <p>Ask questions about your research papers</p>
307
+ </header>
308
+
309
+ <div class="main-grid">
310
+ <!-- Upload Section -->
311
+ <div class="card">
312
+ <h2>πŸ“€ Upload Documents</h2>
313
+
314
+ <div class="upload-area" id="uploadArea">
315
+ <p>πŸ“ Drag & drop PDFs here or click to browse</p>
316
+ <input type="file" id="fileInput" multiple accept=".pdf">
317
+ <button class="btn" onclick="document.getElementById('fileInput').click()">
318
+ Choose Files
319
+ </button>
320
+ </div>
321
+
322
+ <div id="uploadStatus" class="status hidden"></div>
323
+
324
+ <div id="stats" class="stats">
325
+ <div class="stat-box">
326
+ <div class="number" id="totalChunks">0</div>
327
+ <div class="label">Total Chunks</div>
328
+ </div>
329
+ <div class="stat-box">
330
+ <div class="number" id="docCount">0</div>
331
+ <div class="label">Documents</div>
332
+ </div>
333
+ </div>
334
+
335
+ <button class="btn btn-secondary" onclick="loadStats()">
336
+ πŸ”„ Refresh Stats
337
+ </button>
338
+
339
+ <button class="btn btn-secondary" style="background: #ff6b6b; color: white; margin-top: 10px;"
340
+ onclick="resetSystem()">
341
+ πŸ—‘οΈ Delete All Documents
342
+ </button>
343
+
344
+ <p style="font-size: 0.85em; color: #999; margin-top: 10px;">
345
+ πŸ’Ύ Documents are stored persistently. They remain after restart.
346
+ </p>
347
+ </div>
348
+
349
+ <!-- Query Section -->
350
+ <div class="card">
351
+ <h2>❓ Ask Questions</h2>
352
+
353
+ <div class="query-input">
354
+ <input type="text" id="queryInput" placeholder="What would you like to know about your documents?"
355
+ onkeypress="if(event.key==='Enter') submitQuery()">
356
+ <button class="btn" onclick="submitQuery()">Search</button>
357
+ </div>
358
+
359
+ <div id="queryStatus" class="status hidden"></div>
360
+
361
+ <div id="answerContainer" class="hidden">
362
+ <div class="answer-box">
363
+ <h3>Answer</h3>
364
+ <p id="answerText"></p>
365
+ </div>
366
+
367
+ <div class="sources" id="sourcesBox">
368
+ <h4>πŸ“– Sources Used</h4>
369
+ <div id="sourcesList"></div>
370
+ </div>
371
+ </div>
372
+ </div>
373
+ </div>
374
+
375
+ <!-- Status Indicators -->
376
+ <div class="card full-width">
377
+ <h2>πŸ”§ System Status</h2>
378
+ <div id="healthStatus" class="status-grid">Loading...</div>
379
+ </div>
380
+ </div>
381
+
382
+ <script>
383
+ const API_URL = window.location.origin; // 'http://localhost:8000';
384
+
385
+ // Upload handlers
386
+ const uploadArea = document.getElementById('uploadArea');
387
+ const fileInput = document.getElementById('fileInput');
388
+
389
+ uploadArea.addEventListener('click', () => fileInput.click());
390
+ uploadArea.addEventListener('dragover', (e) => {
391
+ e.preventDefault();
392
+ uploadArea.classList.add('dragover');
393
+ });
394
+ uploadArea.addEventListener('dragleave', () => {
395
+ uploadArea.classList.remove('dragover');
396
+ });
397
+ uploadArea.addEventListener('drop', (e) => {
398
+ e.preventDefault();
399
+ uploadArea.classList.remove('dragover');
400
+ handleFiles(e.dataTransfer.files);
401
+ });
402
+
403
+ fileInput.addEventListener('change', (e) => {
404
+ handleFiles(e.target.files);
405
+ });
406
+
407
+ async function handleFiles(files) {
408
+ const statusDiv = document.getElementById('uploadStatus');
409
+
410
+ for (const file of files) {
411
+ if (!file.name.endsWith('.pdf')) {
412
+ showStatus(statusDiv, `Skipping ${file.name} - only PDFs supported`, 'error');
413
+ continue;
414
+ }
415
+
416
+ showStatus(statusDiv, `Uploading ${file.name}...`, 'loading');
417
+
418
+ const formData = new FormData();
419
+ formData.append('file', file);
420
+
421
+ try {
422
+ const response = await fetch(`${API_URL}/ingest`, {
423
+ method: 'POST',
424
+ body: formData
425
+ });
426
+
427
+ if (response.ok) {
428
+ const data = await response.json();
429
+ showStatus(
430
+ statusDiv,
431
+ `βœ“ ${file.name}: ${data.chunks_embedded} chunks ingested`,
432
+ 'success'
433
+ );
434
+ loadStats();
435
+ } else {
436
+ const error = await response.json();
437
+ showStatus(statusDiv, `βœ— ${file.name}: ${error.detail}`, 'error');
438
+ }
439
+ } catch (error) {
440
+ showStatus(statusDiv, `βœ— Upload failed: ${error.message}`, 'error');
441
+ }
442
+ }
443
+
444
+ fileInput.value = '';
445
+ }
446
+
447
+ async function submitQuery() {
448
+ const query = document.getElementById('queryInput').value.trim();
449
+ if (!query) {
450
+ showStatus(
451
+ document.getElementById('queryStatus'),
452
+ 'Please enter a question',
453
+ 'error'
454
+ );
455
+ return;
456
+ }
457
+
458
+ const statusDiv = document.getElementById('queryStatus');
459
+ showStatus(statusDiv, 'Searching your documents...', 'loading');
460
+
461
+ try {
462
+ const response = await fetch(`${API_URL}/query`, {
463
+ method: 'POST',
464
+ headers: { 'Content-Type': 'application/json' },
465
+ body: JSON.stringify({ query, top_k: 3 })
466
+ });
467
+
468
+ if (response.ok) {
469
+ const data = await response.json();
470
+ displayAnswer(data);
471
+ statusDiv.classList.add('hidden');
472
+ } else {
473
+ const error = await response.json();
474
+ showStatus(statusDiv, error.error || 'Query failed', 'error');
475
+ }
476
+ } catch (error) {
477
+ showStatus(statusDiv, `Error: ${error.message}`, 'error');
478
+ }
479
+ }
480
+
481
+ function displayAnswer(data) {
482
+ document.getElementById('answerText').textContent = data.answer;
483
+
484
+ const sourcesList = document.getElementById('sourcesList');
485
+ sourcesList.innerHTML = data.sources.map(source => `
486
+ <div class="source-item">
487
+ <div class="relevance">πŸ“Œ Relevance: ${(source.similarity * 100).toFixed(0)}%</div>
488
+ <div class="text">${source.preview}</div>
489
+ </div>
490
+ `).join('');
491
+
492
+ document.getElementById('answerContainer').classList.remove('hidden');
493
+ }
494
+
495
+ async function loadStats() {
496
+ try {
497
+ const response = await fetch(`${API_URL}/stats`);
498
+ if (response.ok) {
499
+ const data = await response.json();
500
+ document.getElementById('totalChunks').textContent = data.total_chunks;
501
+ }
502
+ } catch (error) {
503
+ console.error('Failed to load stats:', error);
504
+ }
505
+ }
506
+
507
+ async function loadHealth() {
508
+ try {
509
+ const response = await fetch(`${API_URL}/health`);
510
+ if (response.ok) {
511
+ const data = await response.json();
512
+
513
+ // Get embedding backend name
514
+ let embeddingName = data.embedding_backend || 'Unknown';
515
+ // Format nicely
516
+ if (embeddingName === 'sentence-transformers') {
517
+ embeddingName = 'Sentence-Transformers';
518
+ } else if (embeddingName === 'ollama') {
519
+ embeddingName = 'Ollama';
520
+ }
521
+
522
+ const healthHtml = `
523
+ <div class="stat-box">
524
+ <div class="number">${data.embedding_backend ? 'βœ“' : 'βœ—'}</div>
525
+ <div class="label">${embeddingName} (Embeddings)</div>
526
+ </div>
527
+ <div class="stat-box">
528
+ <div class="number">${data.groq === 'βœ“' ? 'βœ“' : 'βœ—'}</div>
529
+ <div class="label">Groq (LLM)</div>
530
+ </div>
531
+ <div class="stat-box">
532
+ <div class="number">${data.chroma.status === 'βœ“' ? 'βœ“' : 'βœ—'}</div>
533
+ <div class="label">Chroma (Vector DB)</div>
534
+ </div>
535
+ <div class="stat-box">
536
+ <div class="number">${data.status === 'healthy' ? 'βœ“' : '⚠'}</div>
537
+ <div class="label">Overall Status</div>
538
+ </div>
539
+ `;
540
+ document.getElementById('healthStatus').innerHTML = healthHtml;
541
+ }
542
+ } catch (error) {
543
+ document.getElementById('healthStatus').innerHTML =
544
+ `<div style="grid-column: 1/-1; padding: 15px; background: #f8d7da; color: #721c24; border-radius: 8px;">Cannot connect to API at ${API_URL}</div>`;
545
+ }
546
+ }
547
+
548
+ async function resetSystem() {
549
+ if (!confirm('⚠️ Delete ALL documents and embeddings? This cannot be undone!')) {
550
+ return;
551
+ }
552
+
553
+ const statusDiv = document.getElementById('uploadStatus');
554
+ showStatus(statusDiv, 'Resetting system...', 'loading');
555
+
556
+ try {
557
+ const response = await fetch(`${API_URL}/reset`, {
558
+ method: 'POST',
559
+ headers: {
560
+ 'Content-Type': 'application/json'
561
+ }
562
+ });
563
+
564
+ if (response.ok) {
565
+ const data = await response.json();
566
+ showStatus(statusDiv, 'βœ“ All documents deleted!', 'success');
567
+ loadStats();
568
+ } else {
569
+ const error = await response.json();
570
+ showStatus(statusDiv, `Reset failed: ${error.detail || 'Unknown error'}`, 'error');
571
+ }
572
+ } catch (error) {
573
+ showStatus(statusDiv, `Error: ${error.message}`, 'error');
574
+ }
575
+ }
576
+
577
+ function showStatus(element, message, type) {
578
+ element.textContent = message;
579
+ element.className = `status ${type}`;
580
+ element.classList.remove('hidden');
581
+ }
582
+
583
+ // Load stats and health on page load
584
+ window.addEventListener('load', () => {
585
+ loadStats();
586
+ loadHealth();
587
+ setInterval(loadHealth, 30000); // Refresh every 30s
588
+ });
589
+ </script>
590
+ </body>
591
+
592
+ </html>
frontend/ragas.html ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>RAGAS Evaluation - Document Intelligence RAG</title>
8
+ <style>
9
+ /* Theme toggle */
10
+ .theme-toggle {
11
+ position: fixed;
12
+ top: 20px;
13
+ right: 20px;
14
+ padding: 8px 14px;
15
+ background: var(--surface);
16
+ border: 1px solid var(--border-soft);
17
+ border-radius: var(--radius-md);
18
+ cursor: pointer;
19
+ font-size: 0.85rem;
20
+ color: var(--text-main);
21
+ z-index: 100;
22
+ transition: background 0.15s ease;
23
+ }
24
+
25
+ .theme-toggle:hover {
26
+ background: var(--surface-subtle);
27
+ }
28
+
29
+ /* Button group */
30
+ .button-group {
31
+ display: flex;
32
+ gap: 10px;
33
+ margin-bottom: 16px;
34
+ }
35
+
36
+ .btn-small {
37
+ padding: 8px 14px;
38
+ background: var(--accent);
39
+ color: white;
40
+ border: none;
41
+ border-radius: var(--radius-sm);
42
+ font-size: 0.85rem;
43
+ cursor: pointer;
44
+ transition: background 0.15s ease;
45
+ }
46
+
47
+ .btn-small:hover {
48
+ background: #1d4ed8;
49
+ }
50
+
51
+ .btn-small.secondary {
52
+ background: var(--surface-subtle);
53
+ color: var(--text-main);
54
+ border: 1px solid var(--border-soft);
55
+ }
56
+
57
+ .btn-small.secondary:hover {
58
+ background: var(--border-soft);
59
+ }
60
+
61
+ /* Same design tokens as index.html */
62
+ :root {
63
+ --surface: #ffffff;
64
+ --surface-subtle: #fafafa;
65
+ --bg-main: #f5f7fb;
66
+ --card-bg: #ffffff;
67
+ --accent: #2563eb;
68
+ --accent-soft: #eff6ff;
69
+ --text-main: #111827;
70
+ --text-muted: #6b7280;
71
+ --border-soft: #e5e7eb;
72
+ --success: #16a34a;
73
+ --error: #dc2626;
74
+ --warning: #f59e0b;
75
+ --radius-sm: 6px;
76
+ --radius-md: 10px;
77
+ --radius-lg: 14px;
78
+ }
79
+
80
+ [data-theme="dark"] {
81
+ --surface: #1f2937;
82
+ --surface-subtle: #111827;
83
+ --bg-main: #0f172a;
84
+ --card-bg: #1e293b;
85
+ --accent: #60a5fa;
86
+ --accent-soft: #1e3a5f;
87
+ --text-main: #f1f5f9;
88
+ --text-muted: #94a3b8;
89
+ --border-soft: #334155;
90
+ }
91
+
92
+ * {
93
+ margin: 0;
94
+ padding: 0;
95
+ box-sizing: border-box;
96
+ }
97
+
98
+ body {
99
+ font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
100
+ background: var(--bg-main);
101
+ min-height: 100vh;
102
+ padding: 24px;
103
+ color: var(--text-main);
104
+ }
105
+
106
+ .container {
107
+ max-width: 900px;
108
+ margin: 0 auto;
109
+ }
110
+
111
+ header {
112
+ text-align: center;
113
+ margin-bottom: 36px;
114
+ }
115
+
116
+ header h1 {
117
+ font-size: 2rem;
118
+ font-weight: 600;
119
+ letter-spacing: -0.02em;
120
+ margin-bottom: 8px;
121
+ }
122
+
123
+ header p {
124
+ font-size: 1rem;
125
+ color: var(--text-muted);
126
+ margin-bottom: 16px;
127
+ }
128
+
129
+ .nav-links {
130
+ display: flex;
131
+ justify-content: center;
132
+ gap: 12px;
133
+ }
134
+
135
+ .nav-links a {
136
+ color: var(--accent);
137
+ text-decoration: none;
138
+ font-size: 0.9rem;
139
+ }
140
+
141
+ .nav-links a:hover {
142
+ text-decoration: underline;
143
+ }
144
+
145
+ .card {
146
+ background: var(--surface);
147
+ border-radius: var(--radius-lg);
148
+ padding: 28px;
149
+ border: 1px solid var(--border-soft);
150
+ box-shadow: 0 8px 20px rgba(0, 0, 0, 0.05);
151
+ margin-bottom: 24px;
152
+ }
153
+
154
+ .card h2 {
155
+ font-size: 1.2rem;
156
+ font-weight: 600;
157
+ margin-bottom: 18px;
158
+ }
159
+
160
+ .query-section {
161
+ display: flex;
162
+ gap: 12px;
163
+ margin-bottom: 16px;
164
+ }
165
+
166
+ .query-section input {
167
+ flex: 1;
168
+ padding: 12px 14px;
169
+ border: 1.5px solid var(--border-soft);
170
+ border-radius: var(--radius-md);
171
+ font-size: 0.95rem;
172
+ background: var(--surface);
173
+ color: var(--text-main);
174
+ }
175
+
176
+ .query-section input:focus {
177
+ outline: none;
178
+ border-color: var(--accent);
179
+ box-shadow: 0 0 0 3px var(--accent-soft);
180
+ }
181
+
182
+ .btn {
183
+ background: var(--accent);
184
+ color: white;
185
+ border: none;
186
+ padding: 12px 24px;
187
+ border-radius: var(--radius-md);
188
+ font-size: 0.95rem;
189
+ font-weight: 500;
190
+ cursor: pointer;
191
+ transition: background 0.15s ease, transform 0.15s ease;
192
+ }
193
+
194
+ .btn:hover {
195
+ background: #1d4ed8;
196
+ transform: translateY(-1px);
197
+ }
198
+
199
+ .btn:disabled {
200
+ background: var(--text-muted);
201
+ cursor: not-allowed;
202
+ transform: none;
203
+ }
204
+
205
+ .status {
206
+ padding: 12px;
207
+ border-radius: var(--radius-md);
208
+ font-size: 0.9rem;
209
+ margin-bottom: 16px;
210
+ }
211
+
212
+ .status.loading {
213
+ background: var(--accent-soft);
214
+ color: var(--accent);
215
+ }
216
+
217
+ .status.error {
218
+ background: #fef2f2;
219
+ color: var(--error);
220
+ }
221
+
222
+ .status.success {
223
+ background: #f0fdf4;
224
+ color: var(--success);
225
+ }
226
+
227
+ .hidden {
228
+ display: none;
229
+ }
230
+
231
+ /* Answer Section */
232
+ .answer-box {
233
+ background: var(--surface-subtle);
234
+ border-radius: var(--radius-md);
235
+ padding: 16px;
236
+ margin-bottom: 20px;
237
+ border-left: 4px solid var(--accent);
238
+ }
239
+
240
+ .answer-box p {
241
+ line-height: 1.6;
242
+ }
243
+
244
+ /* RAGAS Scores */
245
+ .scores-grid {
246
+ display: grid;
247
+ grid-template-columns: repeat(3, 1fr);
248
+ gap: 16px;
249
+ margin-bottom: 20px;
250
+ }
251
+
252
+ .score-card {
253
+ background: var(--surface-subtle);
254
+ border-radius: var(--radius-md);
255
+ padding: 20px;
256
+ text-align: center;
257
+ }
258
+
259
+ .score-label {
260
+ font-size: 0.85rem;
261
+ color: var(--text-muted);
262
+ margin-bottom: 8px;
263
+ text-transform: uppercase;
264
+ letter-spacing: 0.5px;
265
+ }
266
+
267
+ .score-value {
268
+ font-size: 2rem;
269
+ font-weight: 700;
270
+ }
271
+
272
+ .score-value.high {
273
+ color: var(--success);
274
+ }
275
+
276
+ .score-value.medium {
277
+ color: var(--warning);
278
+ }
279
+
280
+ .score-value.low {
281
+ color: var(--error);
282
+ }
283
+
284
+ /* Progress Bar */
285
+ .progress-bar {
286
+ height: 8px;
287
+ background: var(--border-soft);
288
+ border-radius: 4px;
289
+ overflow: hidden;
290
+ margin-top: 8px;
291
+ }
292
+
293
+ .progress-fill {
294
+ height: 100%;
295
+ border-radius: 4px;
296
+ transition: width 0.5s ease;
297
+ }
298
+
299
+ .progress-fill.high {
300
+ background: var(--success);
301
+ }
302
+
303
+ .progress-fill.medium {
304
+ background: var(--warning);
305
+ }
306
+
307
+ .progress-fill.low {
308
+ background: var(--error);
309
+ }
310
+
311
+ /* Aggregate Stats */
312
+ .aggregate-stats {
313
+ display: flex;
314
+ justify-content: space-around;
315
+ padding: 16px;
316
+ background: var(--surface-subtle);
317
+ border-radius: var(--radius-md);
318
+ }
319
+
320
+ .stat-item {
321
+ text-align: center;
322
+ }
323
+
324
+ .stat-value {
325
+ font-size: 1.5rem;
326
+ font-weight: 600;
327
+ color: var(--accent);
328
+ }
329
+
330
+ .stat-label {
331
+ font-size: 0.8rem;
332
+ color: var(--text-muted);
333
+ }
334
+
335
+ /* Sources */
336
+ .sources-section {
337
+ margin-top: 20px;
338
+ }
339
+
340
+ .sources-section h3 {
341
+ font-size: 1rem;
342
+ margin-bottom: 12px;
343
+ color: var(--text-muted);
344
+ }
345
+
346
+ .source-item {
347
+ background: var(--surface-subtle);
348
+ border-radius: var(--radius-sm);
349
+ padding: 12px;
350
+ margin-bottom: 8px;
351
+ font-size: 0.85rem;
352
+ }
353
+
354
+ .source-similarity {
355
+ color: var(--accent);
356
+ font-weight: 500;
357
+ margin-bottom: 4px;
358
+ }
359
+
360
+ /* Info Box */
361
+ .info-box {
362
+ background: var(--accent-soft);
363
+ border-left: 4px solid var(--accent);
364
+ padding: 14px;
365
+ border-radius: var(--radius-sm);
366
+ margin-bottom: 20px;
367
+ }
368
+
369
+ .info-box p {
370
+ font-size: 0.9rem;
371
+ color: var(--text-muted);
372
+ }
373
+
374
+ /* Time display */
375
+ .time-info {
376
+ font-size: 0.85rem;
377
+ color: var(--text-muted);
378
+ text-align: right;
379
+ margin-top: 12px;
380
+ }
381
+
382
+ @media (max-width: 600px) {
383
+ .scores-grid {
384
+ grid-template-columns: 1fr;
385
+ }
386
+
387
+ .query-section {
388
+ flex-direction: column;
389
+ }
390
+ }
391
+ </style>
392
+ </head>
393
+
394
+ <body>
395
+ <button class="theme-toggle" id="themeToggle">πŸŒ™ Dark mode</button>
396
+
397
+ <div class="container">
398
+ <header>
399
+ <h1>πŸ”¬ RAGAS Evaluation</h1>
400
+ <p>Test the RAG pipeline and see real-time quality metrics</p>
401
+ <div class="nav-links">
402
+ <a href="/">← Back to Main</a>
403
+ <a href="/evaluation">View Full Dashboard</a>
404
+ </div>
405
+ </header>
406
+
407
+ <!-- Query Section -->
408
+ <div class="card">
409
+ <h2>Ask a Question</h2>
410
+ <div class="info-box">
411
+ <p>Enter a question to query the RAG system. RAGAS will evaluate the response for
412
+ <strong>Faithfulness</strong> (is the answer grounded in context?) and <strong>Context
413
+ Precision</strong> (were the retrieved chunks useful?).
414
+ </p>
415
+ </div>
416
+ <div class="query-section">
417
+ <input type="text" id="queryInput" placeholder="e.g., What is machine learning?"
418
+ onkeypress="if(event.key === 'Enter') runEvaluation()">
419
+ <button class="btn" id="evalBtn" onclick="runEvaluation()">
420
+ πŸ” Query & Evaluate
421
+ </button>
422
+ </div>
423
+ <div id="status" class="status hidden"></div>
424
+ </div>
425
+
426
+ <!-- Results Section -->
427
+ <div id="resultsSection" class="card hidden">
428
+ <h2>πŸ“Š RAGAS Evaluation Results</h2>
429
+
430
+ <!-- Scores -->
431
+ <div class="scores-grid">
432
+ <div class="score-card">
433
+ <div class="score-label">Faithfulness</div>
434
+ <div class="score-value" id="faithScore">-</div>
435
+ <div class="progress-bar">
436
+ <div class="progress-fill" id="faithBar" style="width: 0%"></div>
437
+ </div>
438
+ </div>
439
+ <div class="score-card">
440
+ <div class="score-label">Context Precision</div>
441
+ <div class="score-value" id="precisionScore">-</div>
442
+ <div class="progress-bar">
443
+ <div class="progress-fill" id="precisionBar" style="width: 0%"></div>
444
+ </div>
445
+ </div>
446
+ <div class="score-card">
447
+ <div class="score-label">RAGAS Score</div>
448
+ <div class="score-value" id="ragasScore">-</div>
449
+ <div class="progress-bar">
450
+ <div class="progress-fill" id="ragasBar" style="width: 0%"></div>
451
+ </div>
452
+ </div>
453
+ </div>
454
+
455
+ <!-- Answer -->
456
+ <h3 style="margin-bottom: 12px;">πŸ“ Generated Answer</h3>
457
+ <div class="answer-box">
458
+ <p id="answerText">-</p>
459
+ </div>
460
+
461
+ <!-- Sources -->
462
+ <div class="sources-section">
463
+ <h3>πŸ“š Retrieved Sources</h3>
464
+ <div id="sourcesList"></div>
465
+ </div>
466
+
467
+ <div class="time-info">
468
+ <span id="responseTime"></span> | <span id="evalTime"></span>
469
+ </div>
470
+ </div>
471
+
472
+ <!-- Aggregate Stats -->
473
+ <div class="card">
474
+ <h2>πŸ“ˆ Aggregate Statistics</h2>
475
+ <div class="button-group">
476
+ <button class="btn-small" onclick="loadAggregateStats()">πŸ”„ Refresh</button>
477
+ <button class="btn-small secondary" onclick="clearRagasResults()">πŸ—‘οΈ Clear Results</button>
478
+ </div>
479
+ <div class="aggregate-stats">
480
+ <div class="stat-item">
481
+ <div class="stat-value" id="totalEvals">0</div>
482
+ <div class="stat-label">Total Evaluations</div>
483
+ </div>
484
+ <div class="stat-item">
485
+ <div class="stat-value" id="avgFaith">-</div>
486
+ <div class="stat-label">Avg Faithfulness</div>
487
+ </div>
488
+ <div class="stat-item">
489
+ <div class="stat-value" id="avgPrecision">-</div>
490
+ <div class="stat-label">Avg Precision</div>
491
+ </div>
492
+ <div class="stat-item">
493
+ <div class="stat-value" id="avgRagas">-</div>
494
+ <div class="stat-label">Avg RAGAS Score</div>
495
+ </div>
496
+ </div>
497
+ </div>
498
+ </div>
499
+
500
+ <script>
501
+ async function clearRagasResults() {
502
+ if (!confirm('Clear all RAGAS evaluation results? This cannot be undone.')) {
503
+ return;
504
+ }
505
+
506
+ try {
507
+ const response = await fetch(`${API_URL}/ragas/reset`, {
508
+ method: 'POST'
509
+ });
510
+
511
+ if (response.ok) {
512
+ // Reset displayed values
513
+ document.getElementById('totalEvals').textContent = '0';
514
+ document.getElementById('avgFaith').textContent = '-';
515
+ document.getElementById('avgPrecision').textContent = '-';
516
+ document.getElementById('avgRagas').textContent = '-';
517
+
518
+ alert('RAGAS results cleared!');
519
+ } else {
520
+ alert('Failed to clear results');
521
+ }
522
+ } catch (error) {
523
+ alert('Error: ' + error.message);
524
+ }
525
+ }
526
+ // Dark mode toggle
527
+ const themeToggle = document.getElementById("themeToggle");
528
+ const root = document.documentElement;
529
+
530
+ const savedTheme = localStorage.getItem("theme");
531
+ const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
532
+
533
+ if (savedTheme) {
534
+ root.setAttribute("data-theme", savedTheme);
535
+ } else if (prefersDark) {
536
+ root.setAttribute("data-theme", "dark");
537
+ }
538
+
539
+ function updateToggleText() {
540
+ const isDark = root.getAttribute("data-theme") === "dark";
541
+ themeToggle.textContent = isDark ? "β˜€οΈ Light mode" : "πŸŒ™ Dark mode";
542
+ }
543
+ updateToggleText();
544
+
545
+ themeToggle.addEventListener("click", () => {
546
+ const isDark = root.getAttribute("data-theme") === "dark";
547
+ const newTheme = isDark ? "light" : "dark";
548
+ root.setAttribute("data-theme", newTheme);
549
+ localStorage.setItem("theme", newTheme);
550
+ updateToggleText();
551
+ });
552
+ const API_URL = window.location.origin;
553
+
554
+ async function runEvaluation() {
555
+ const query = document.getElementById('queryInput').value.trim();
556
+ if (!query) {
557
+ showStatus('Please enter a question', 'error');
558
+ return;
559
+ }
560
+
561
+ const btn = document.getElementById('evalBtn');
562
+ btn.disabled = true;
563
+ btn.textContent = '⏳ Evaluating...';
564
+
565
+ showStatus('Querying RAG and running RAGAS evaluation...', 'loading');
566
+ document.getElementById('resultsSection').classList.add('hidden');
567
+
568
+ try {
569
+ const response = await fetch(`${API_URL}/ragas/query-and-evaluate`, {
570
+ method: 'POST',
571
+ headers: { 'Content-Type': 'application/json' },
572
+ body: JSON.stringify({ query, top_k: 3 })
573
+ });
574
+
575
+ if (!response.ok) {
576
+ throw new Error('Evaluation failed');
577
+ }
578
+
579
+ const data = await response.json();
580
+ displayResults(data);
581
+ hideStatus();
582
+ loadAggregateStats();
583
+
584
+ } catch (error) {
585
+ showStatus(`Error: ${error.message}`, 'error');
586
+ } finally {
587
+ btn.disabled = false;
588
+ btn.textContent = 'πŸ” Query & Evaluate';
589
+ }
590
+ }
591
+
592
+ function displayResults(data) {
593
+ document.getElementById('resultsSection').classList.remove('hidden');
594
+
595
+ // Answer
596
+ document.getElementById('answerText').textContent = data.answer;
597
+
598
+ // RAGAS Scores
599
+ if (data.ragas) {
600
+ setScore('faithScore', 'faithBar', data.ragas.faithfulness);
601
+ setScore('precisionScore', 'precisionBar', data.ragas.context_precision);
602
+ setScore('ragasScore', 'ragasBar', data.ragas.ragas_score);
603
+
604
+ document.getElementById('evalTime').textContent =
605
+ `Eval: ${data.ragas.eval_time_ms.toFixed(0)}ms`;
606
+ }
607
+
608
+ document.getElementById('responseTime').textContent =
609
+ `Response: ${data.response_time_ms.toFixed(0)}ms`;
610
+
611
+ // Sources
612
+ const sourcesList = document.getElementById('sourcesList');
613
+ sourcesList.innerHTML = data.sources.map(source => `
614
+ <div class="source-item">
615
+ <div class="source-similarity">πŸ“Œ Similarity: ${(source.similarity * 100).toFixed(0)}%</div>
616
+ <div>${source.preview}</div>
617
+ </div>
618
+ `).join('');
619
+ }
620
+
621
+ function setScore(valueId, barId, score) {
622
+ const valueEl = document.getElementById(valueId);
623
+ const barEl = document.getElementById(barId);
624
+
625
+ const percent = (score * 100).toFixed(0);
626
+ valueEl.textContent = percent + '%';
627
+ barEl.style.width = percent + '%';
628
+
629
+ // Color coding
630
+ let colorClass = 'high';
631
+ if (score < 0.7) colorClass = 'medium';
632
+ if (score < 0.5) colorClass = 'low';
633
+
634
+ valueEl.className = 'score-value ' + colorClass;
635
+ barEl.className = 'progress-fill ' + colorClass;
636
+ }
637
+
638
+ async function loadAggregateStats() {
639
+ try {
640
+ const response = await fetch(`${API_URL}/ragas/metrics`);
641
+ const data = await response.json();
642
+
643
+ document.getElementById('totalEvals').textContent = data.total_evaluations || 0;
644
+ document.getElementById('avgFaith').textContent =
645
+ data.avg_faithfulness ? (data.avg_faithfulness * 100).toFixed(0) + '%' : '-';
646
+ document.getElementById('avgPrecision').textContent =
647
+ data.avg_context_precision ? (data.avg_context_precision * 100).toFixed(0) + '%' : '-';
648
+ document.getElementById('avgRagas').textContent =
649
+ data.avg_ragas_score ? (data.avg_ragas_score * 100).toFixed(0) + '%' : '-';
650
+ } catch (e) {
651
+ console.error('Failed to load aggregate stats:', e);
652
+ }
653
+ }
654
+
655
+ function showStatus(message, type) {
656
+ const status = document.getElementById('status');
657
+ status.textContent = message;
658
+ status.className = `status ${type}`;
659
+ status.classList.remove('hidden');
660
+ }
661
+
662
+ function hideStatus() {
663
+ document.getElementById('status').classList.add('hidden');
664
+ }
665
+
666
+ // Load stats on page load
667
+ window.addEventListener('load', loadAggregateStats);
668
+ </script>
669
+ </body>
670
+
671
+ </html>
pyproject.toml CHANGED
@@ -16,7 +16,10 @@ dependencies = [
16
  "python-dotenv>=1.2.1",
17
  "python-multipart>=0.0.20",
18
  "requests>=2.32.5",
19
- "sentence-transformers>=5.2.0",
20
  "uvicorn[standard]>=0.38.0",
21
- "scikit-learn>=1.3.0"
 
 
 
22
  ]
 
16
  "python-dotenv>=1.2.1",
17
  "python-multipart>=0.0.20",
18
  "requests>=2.32.5",
19
+ "sentence-transformers>=5.2.0",
20
  "uvicorn[standard]>=0.38.0",
21
+ "scikit-learn>=1.3.0",
22
+ "ragas>=0.4.2",
23
+ "langchain-groq>=1.1.1",
24
+ "langchain-openai>=1.1.6",
25
  ]
run_real_evaluation.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Real RAG Evaluation Script
3
+ Runs actual queries through my RAG and computes real metrics.
4
+ """
5
+
6
+ import json
7
+ import tempfile
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+ from typing import List, Dict, Any
12
+ import numpy as np
13
+ from sentence_transformers import SentenceTransformer, util
14
+
15
+ from src.rag.pipeline import RAGPipeline
16
+ from src.evaluation import RAGEvaluator, EvaluationResult
17
+
18
+
19
+ # ==================== TEST DATASET ====================
20
+ # These are synthetic queries + documents, but metrics are REAL
21
+ # based on actual retrieval and generation from my RAG
22
+
23
+ TEST_DOCUMENTS = {
24
+ "medical_research_1.txt": """
25
+ Drug X Clinical Trial Results
26
+
27
+ A Phase III clinical trial was conducted to evaluate the efficacy and safety of Drug X
28
+ in patients with condition Y. The study enrolled 500 patients aged 18-65 with confirmed
29
+ diagnosis of condition Y.
30
+
31
+ Primary Efficacy Endpoint:
32
+ Drug X demonstrated a 65% response rate compared to 35% in the placebo group (p<0.001).
33
+ The median time to response was 4 weeks.
34
+
35
+ Safety Profile:
36
+ The most common adverse events were:
37
+ - Headache (12% of patients)
38
+ - Mild gastrointestinal upset (8% of patients)
39
+ - Dizziness (5% of patients)
40
+ - Fatigue (4% of patients)
41
+
42
+ Serious adverse events occurred in 2% of patients, including liver enzyme elevation.
43
+ No deaths were attributed to the drug during the trial period.
44
+
45
+ Dosage Recommendations:
46
+ The recommended dose is 500mg twice daily with meals. Dose adjustments may be necessary
47
+ for patients with renal impairment (dose reduction to 250mg twice daily recommended).
48
+
49
+ Mechanism of Action:
50
+ Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells.
51
+ This inhibition leads to cell cycle arrest and apoptosis of affected cells.
52
+ """,
53
+
54
+ "drug_interactions.txt": """
55
+ Drug X Drug Interaction Guide
56
+
57
+ Important Drug Interactions:
58
+
59
+ 1. CYP3A4 Inhibitors (e.g., ketoconazole, ritonavir):
60
+ - May increase Drug X levels by 3-5 fold
61
+ - Monitor for adverse effects
62
+ - Consider dose reduction
63
+
64
+ 2. Warfarin:
65
+ - Potential increased bleeding risk
66
+ - Monitor INR closely
67
+ - Baseline INR and weekly monitoring recommended
68
+
69
+ 3. Oral Contraceptives:
70
+ - May reduce contraceptive efficacy
71
+ - Alternative contraception recommended
72
+ - No dose adjustment needed for Drug X
73
+
74
+ 4. NSAIDs:
75
+ - Increased risk of GI bleeding
76
+ - Monitor for GI symptoms
77
+ - Consider gastroprotection
78
+
79
+ 5. ACE Inhibitors:
80
+ - No significant interaction
81
+ - Safe to use concomitantly
82
+ - No monitoring required
83
+ """,
84
+
85
+ "patient_case_study.txt": """
86
+ Case Study: 45-year-old Female with Condition Y
87
+
88
+ Patient History:
89
+ A 45-year-old female presented with a 6-month history of progressive symptoms consistent
90
+ with condition Y. She has a past medical history of hypertension controlled on lisinopril
91
+ and type 2 diabetes on metformin.
92
+
93
+ Treatment Response:
94
+ Patient was started on Drug X 500mg twice daily. After 2 weeks of treatment, she reported
95
+ partial symptom improvement. By week 6, she achieved complete response with 95% symptom
96
+ resolution.
97
+
98
+ Side Effects Experienced:
99
+ - Mild headache (treated with acetaminophen)
100
+ - Occasional nausea (resolved with food intake)
101
+ - No serious adverse events
102
+
103
+ Follow-up:
104
+ Patient continues to do well on Drug X at 6-month follow-up with sustained response.
105
+ No dose adjustments were necessary. Lab values remain within normal limits.
106
+ """
107
+ }
108
+
109
+ TEST_CASES = [
110
+ {
111
+ "query": "What is the response rate of Drug X?",
112
+ "expected_answer_keywords": ["65%", "response rate"],
113
+ "expected_source_docs": ["medical_research_1"],
114
+ "description": "Should retrieve clinical trial data"
115
+ },
116
+ {
117
+ "query": "What are the side effects of Drug X?",
118
+ "expected_answer_keywords": ["headache", "gastrointestinal", "dizziness"],
119
+ "expected_source_docs": ["medical_research_1"],
120
+ "description": "Should retrieve safety profile section"
121
+ },
122
+ {
123
+ "query": "How does Drug X interact with warfarin?",
124
+ "expected_answer_keywords": ["warfarin", "bleeding", "INR"],
125
+ "expected_source_docs": ["drug_interactions"],
126
+ "description": "Should retrieve drug interactions guide"
127
+ },
128
+ {
129
+ "query": "What is the recommended dosage of Drug X?",
130
+ "expected_answer_keywords": ["500mg", "twice daily"],
131
+ "expected_source_docs": ["medical_research_1"],
132
+ "description": "Should retrieve dosage recommendations"
133
+ },
134
+ {
135
+ "query": "What is the mechanism of action for Drug X?",
136
+ "expected_answer_keywords": ["protein kinase", "inhibiting", "apoptosis"],
137
+ "expected_source_docs": ["medical_research_1"],
138
+ "description": "Should retrieve mechanism section"
139
+ },
140
+ ]
141
+
142
+
143
+ # ==================== METRIC COMPUTATION ====================
144
+
145
+ def compute_retrieval_precision(
146
+ retrieved_docs: List[str],
147
+ expected_docs: List[str]
148
+ ) -> float:
149
+ """
150
+ Precision: Of the docs we retrieved, what % were actually relevant?
151
+
152
+ Formula: TP / (TP + FP)
153
+ where TP = relevant docs we retrieved
154
+ FP = irrelevant docs we retrieved
155
+ """
156
+ if not retrieved_docs:
157
+ return 0.0
158
+
159
+ # Count how many retrieved docs match expected
160
+ relevant_count = sum(1 for doc in retrieved_docs if doc in expected_docs)
161
+
162
+ precision = relevant_count / len(retrieved_docs)
163
+ return float(precision)
164
+
165
+
166
+ def compute_retrieval_recall(
167
+ retrieved_docs: List[str],
168
+ expected_docs: List[str]
169
+ ) -> float:
170
+ """
171
+ Recall: Of all relevant docs, what % did we actually retrieve?
172
+
173
+ Formula: TP / (TP + FN)
174
+ where TP = relevant docs we retrieved
175
+ FN = relevant docs we missed
176
+ """
177
+ if not expected_docs:
178
+ return 1.0 # If no docs expected, perfect recall
179
+
180
+ # Count how many expected docs were retrieved
181
+ relevant_count = sum(1 for doc in expected_docs if doc in retrieved_docs)
182
+
183
+ recall = relevant_count / len(expected_docs)
184
+ return float(recall)
185
+
186
+
187
+ def compute_bert_score(generated_answer: str, expected_keywords: List[str]) -> float:
188
+ """
189
+ Semantic similarity: Does the answer contain the right semantic information?
190
+
191
+ Approximation: Check if expected keywords appear semantically in the answer
192
+ This is a simplified version. Real BERTScore would use embeddings.
193
+ """
194
+ if not expected_keywords:
195
+ return 1.0
196
+
197
+ answer_lower = generated_answer.lower()
198
+
199
+ # Count how many keywords appear in the answer (fuzzy match)
200
+ found_keywords = 0
201
+ for keyword in expected_keywords:
202
+ if keyword.lower() in answer_lower:
203
+ found_keywords += 1
204
+
205
+ # Score based on keyword coverage
206
+ score = found_keywords / len(expected_keywords)
207
+
208
+ # Cap at 0.95 since answer likely contains more than just keywords
209
+ return float(min(score, 0.95))
210
+
211
+
212
+ def compute_answer_relevance(generated_answer: str, query: str) -> float:
213
+ """
214
+ Is the answer actually addressing the query?
215
+
216
+ Approximation: Check if answer is non-trivial and not a refusal
217
+ """
218
+ answer_lower = generated_answer.lower()
219
+
220
+ # Refusal indicators
221
+ refusal_phrases = [
222
+ "i don't know",
223
+ "i cannot",
224
+ "i'm unable",
225
+ "not found",
226
+ "no information",
227
+ "unable to find"
228
+ ]
229
+
230
+ if any(phrase in answer_lower for phrase in refusal_phrases):
231
+ return 0.3
232
+
233
+ # Answer has reasonable length
234
+ if len(generated_answer.split()) < 3:
235
+ return 0.4
236
+
237
+ return 0.85 # Assume relevant if not a refusal
238
+
239
+
240
+ def detect_hallucinations(
241
+ generated_answer: str,
242
+ retrieved_context: str
243
+ ) -> bool:
244
+ """
245
+ Did the LLM make up information not in the sources?
246
+
247
+ Simplified approach: Check if answer contradicts source context
248
+ Real implementation would use NLI models
249
+ """
250
+ # This is hard to do perfectly without advanced NLI
251
+ # For now, assume no hallucinations if answer is relatively short and grounded
252
+ # In production, you'd use a dedicated hallucination detector
253
+
254
+ answer_words = set(generated_answer.lower().split())
255
+ context_words = set(retrieved_context.lower().split())
256
+
257
+ # If too many words from answer aren't in context, might be hallucinating
258
+ # (very loose approximation)
259
+ overlap = len(answer_words & context_words) / max(len(answer_words), 1)
260
+
261
+ # Conservative: flag as hallucination if very low overlap
262
+ is_hallucination = overlap < 0.3
263
+
264
+ return is_hallucination
265
+
266
+
267
+ def compute_faithfulness(
268
+ generated_answer: str,
269
+ retrieved_context: str
270
+ ) -> float:
271
+ """
272
+ Is the answer grounded in the sources?
273
+
274
+ Approximation: Word overlap between answer and context
275
+ Higher overlap = more grounded
276
+ """
277
+ answer_words = set(generated_answer.lower().split())
278
+ context_words = set(retrieved_context.lower().split())
279
+
280
+ if not answer_words:
281
+ return 0.0
282
+
283
+ # Overlap ratio
284
+ overlap = len(answer_words & context_words) / len(answer_words)
285
+
286
+ # Convert to 0-1 scale (0.3 overlap = 0.6 faithfulness)
287
+ faithfulness = min(overlap * 2, 1.0)
288
+
289
+ return float(faithfulness)
290
+
291
+
292
+ def compute_source_attribution(
293
+ generated_answer: str,
294
+ retrieved_context: str
295
+ ) -> float:
296
+ """
297
+ What % of the answer is backed by sources?
298
+
299
+ Approximation: Check what % of answer words appear in retrieved context
300
+ """
301
+ answer_words = generated_answer.lower().split()
302
+ context_words = set(retrieved_context.lower().split())
303
+
304
+ if not answer_words:
305
+ return 0.0
306
+
307
+ attributed_words = sum(1 for word in answer_words if word in context_words)
308
+ attribution_score = attributed_words / len(answer_words)
309
+
310
+ return float(attribution_score)
311
+
312
+
313
+ # ==================== MAIN EVALUATION LOOP ====================
314
+
315
+ def run_real_evaluation():
316
+ """
317
+ Run actual evaluation against your RAG system.
318
+ """
319
+ print("=" * 70)
320
+ print("REAL RAG EVALUATION")
321
+ print("=" * 70)
322
+
323
+ # Initialize evaluator
324
+ evaluator = RAGEvaluator(store_results=True, results_dir="evaluation_results")
325
+
326
+ # Create temporary directory for test documents
327
+ with tempfile.TemporaryDirectory() as tmpdir:
328
+ print(f"\nCreating test documents in {tmpdir}")
329
+
330
+ # Write test documents
331
+ doc_paths = {}
332
+ for doc_name, content in TEST_DOCUMENTS.items():
333
+ doc_path = Path(tmpdir) / doc_name
334
+ doc_path.write_text(content)
335
+ doc_paths[doc_name] = str(doc_path)
336
+ print(f" βœ“ {doc_name}")
337
+
338
+ # Initialize RAG pipeline
339
+ print(f"\nπŸš€ Initializing RAG pipeline...")
340
+ try:
341
+ pipeline = RAGPipeline()
342
+ except Exception as e:
343
+ print(f" βœ— Failed to initialize pipeline: {e}")
344
+ return
345
+
346
+ # Ingest documents
347
+ print(f"\nIngesting documents into RAG...")
348
+ try:
349
+ for doc_name, content in TEST_DOCUMENTS.items():
350
+ doc_id = Path(doc_name).stem # Remove extension
351
+ pipeline.ingest(doc_id, content)
352
+ print(f" βœ“ Ingested {doc_name} (doc_id: {doc_id})")
353
+ except Exception as e:
354
+ print(f" βœ— Failed to ingest documents: {e}")
355
+ return
356
+
357
+ # Run test cases
358
+ print(f"\nRunning {len(TEST_CASES)} test cases...\n")
359
+
360
+ all_retrieved_docs = []
361
+
362
+ for i, test_case in enumerate(TEST_CASES, 1):
363
+ query = test_case["query"]
364
+ expected_keywords = test_case["expected_answer_keywords"]
365
+ expected_docs = test_case["expected_source_docs"]
366
+
367
+ print(f"Test {i}: {query}")
368
+ print(f" Expected sources: {expected_docs}")
369
+
370
+ try:
371
+ start_time = time.time()
372
+
373
+ # Query the RAG
374
+ result = pipeline.query(query, return_sources=True)
375
+
376
+ latency_ms = (time.time() - start_time) * 1000
377
+
378
+ answer = result.get('answer', '')
379
+ retrieved_docs = result.get('sources', [])
380
+ context = result.get('context', '')
381
+
382
+ # If context is empty, reconstruct from retrieved sources
383
+ if not context and retrieved_docs:
384
+ # Combine previews from all retrieved sources
385
+ context = ' '.join([source.get('preview', '') for source in retrieved_docs if isinstance(source, dict)])
386
+
387
+ print(f" DEBUG - context length: {len(context)}")
388
+
389
+ # Extract doc names from sources
390
+ # Sources are dicts with 'chunk_id' like 'medical_research_1_chunk_0'
391
+ retrieved_doc_names = []
392
+ for source in retrieved_docs:
393
+ if isinstance(source, dict):
394
+ chunk_id = source.get('chunk_id', '')
395
+ # Extract doc_id from chunk_id (format: {doc_id}_chunk_{num})
396
+ doc_id = chunk_id.rsplit('_chunk_', 1)[0] if '_chunk_' in chunk_id else 'unknown'
397
+ retrieved_doc_names.append(doc_id)
398
+
399
+ all_retrieved_docs.extend(retrieved_doc_names)
400
+
401
+ print(f" Retrieved: {retrieved_doc_names}")
402
+ print(f" Answer: {answer[:100]}...")
403
+
404
+ # COMPUTE REAL METRICS
405
+ retrieval_precision = compute_retrieval_precision(
406
+ retrieved_doc_names,
407
+ expected_docs
408
+ )
409
+ retrieval_recall = compute_retrieval_recall(
410
+ retrieved_doc_names,
411
+ expected_docs
412
+ )
413
+ bert_score = compute_bert_score(answer, expected_keywords)
414
+ answer_relevance = compute_answer_relevance(answer, query)
415
+ faithfulness = compute_faithfulness(answer, context)
416
+ hallucination_detected = detect_hallucinations(answer, context)
417
+ source_attribution = compute_source_attribution(answer, context)
418
+
419
+ print(f" Precision: {retrieval_precision:.2f} | Recall: {retrieval_recall:.2f}")
420
+ print(f" BERTScore: {bert_score:.2f} | Relevance: {answer_relevance:.2f}")
421
+ print(f" Faithfulness: {faithfulness:.2f} | Attribution: {source_attribution:.2f}")
422
+ print(f" Hallucination: {hallucination_detected} | Latency: {latency_ms:.0f}ms")
423
+
424
+ # Create evaluation result
425
+ eval_result = EvaluationResult(
426
+ query=query,
427
+ answer=answer,
428
+ source_docs=retrieved_doc_names,
429
+ num_retrieved=len(retrieved_docs),
430
+ retrieval_precision=retrieval_precision,
431
+ retrieval_recall=retrieval_recall,
432
+ rank_position=1 if expected_docs[0] in retrieved_doc_names else 2,
433
+ rouge_l=bert_score, # Approximation
434
+ bert_score=bert_score,
435
+ answer_relevance=answer_relevance,
436
+ faithfulness=faithfulness,
437
+ hallucination_detected=hallucination_detected,
438
+ source_attribution_score=source_attribution,
439
+ latency_ms=latency_ms,
440
+ tokens_used=len(answer.split()),
441
+ cost_cents=0.004, # Estimate for Groq
442
+ )
443
+
444
+ evaluator.add_result(eval_result)
445
+
446
+ print()
447
+
448
+ except Exception as e:
449
+ print(f" βœ— Test failed: {e}\n")
450
+
451
+ # Print summary
452
+ print("\n" + "=" * 70)
453
+ print("EVALUATION SUMMARY")
454
+ print("=" * 70)
455
+
456
+ metrics = evaluator.compute_aggregate_metrics()
457
+
458
+ print(f"\nResults:")
459
+ print(f" Total Evaluations: {metrics['total_evaluations']}")
460
+ print(f" Avg Precision: {metrics['retrieval_precision_mean']:.3f}")
461
+ print(f" Avg Recall: {metrics['retrieval_recall_mean']:.3f}")
462
+ print(f" Avg BERTScore: {metrics['bert_score_mean']:.3f}")
463
+ print(f" Avg Faithfulness: {metrics['faithfulness_mean']:.3f}")
464
+ print(f" Hallucination Rate: {metrics['hallucination_rate']*100:.1f}%")
465
+ print(f" Avg Latency: {metrics['latency_mean']:.0f}ms")
466
+ print(f" MRR: {metrics['mrr']:.3f}")
467
+
468
+ print(f"\nResults saved to: evaluation_results/results.jsonl")
469
+ print(f"View dashboard at: http://localhost:8000/evaluation")
470
+
471
+
472
+ if __name__ == "__main__":
473
+ run_real_evaluation()
src/evaluation/ragas_integration/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAGAS Integration for Document Intelligence RAG"""
2
+
3
+
4
+ from .pipeline_adapter import RagasReadyPipeline, EvaluationReadyResponse
5
+ from .ragas_evaluator import RagasEvaluator, RagasEvaluationResult
6
+ from .ragas_endpoints import init_ragas_router
7
+
8
+ __all__ = [
9
+ "RagasReadyPipeline",
10
+ "EvaluationReadyResponse",
11
+ "RagasEvaluator",
12
+ "RagasEvaluationResult",
13
+ "init_ragas_router",
14
+ ]
src/evaluation/ragas_integration/pipeline_adapter.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipeline Adapter - Captures full context for RAGAS evaluation
3
+ """
4
+ import time
5
+ from typing import Dict, Any, List, Optional
6
+ from dataclasses import dataclass
7
+ from src.rag.llm import build_context_string
8
+
9
+
10
+ @dataclass
11
+ class EvaluationReadyResponse:
12
+ """Response with full contexts for RAGAS."""
13
+ query: str
14
+ answer: str
15
+ contexts: List[str] # Full text of each retrieved chunk
16
+ sources: List[Dict] # Original source metadata
17
+ chunks_used: int
18
+ response_time_ms: float
19
+ status: str
20
+
21
+
22
+ class RagasReadyPipeline:
23
+ """
24
+ Wraps your RAGPipeline to capture full context.
25
+ """
26
+
27
+ def __init__(self, base_pipeline):
28
+ """
29
+ Args:
30
+ base_pipeline: Your existing RAGPipeline instance
31
+ """
32
+ self.pipeline = base_pipeline
33
+ self.config = base_pipeline.config
34
+ self.embeddings = base_pipeline.embeddings
35
+ self.llm = base_pipeline.llm
36
+ self.vector_store = base_pipeline.vector_store
37
+
38
+ def query_for_evaluation(
39
+ self,
40
+ query: str,
41
+ top_k: Optional[int] = None
42
+ ) -> EvaluationReadyResponse:
43
+ """
44
+ Query and capture FULL context for RAGAS.
45
+
46
+ TODO: Implement this by:
47
+ 1. Embedding the query (use self.pipeline.embeddings)
48
+ 2. Retrieving chunks (use self.pipeline.vector_store.retrieve)
49
+ 3. Extracting FULL text from each chunk
50
+ 4. Building context for LLM
51
+ 5. Getting answer from LLM
52
+ 6. Returning EvaluationReadyResponse with full contexts
53
+ """
54
+ start_time = time.time()
55
+
56
+ if self.vector_store.size() == 0:
57
+ return EvaluationReadyResponse(
58
+ query=query,
59
+ answer="No documents in vector store",
60
+ contexts=[],
61
+ sources=[],
62
+ chunks_used=0,
63
+ response_time_ms=0,
64
+ status="no_documents"
65
+ )
66
+
67
+ query_embedding = self.embeddings.embed(query)
68
+ k = top_k or self.config.top_k
69
+ retrieved_chunks = self.vector_store.retrieve(query_embedding, top_k=k)
70
+ if not retrieved_chunks:
71
+ return EvaluationReadyResponse(
72
+ query=query,
73
+ answer="No relevant documents found.",
74
+ contexts=[],
75
+ sources=[],
76
+ chunks_used=0,
77
+ response_time_ms=(time.time() - start_time) * 1000,
78
+ status="no_results"
79
+ )
80
+ contexts = [chunk.text for chunk in retrieved_chunks]
81
+
82
+ context_string = build_context_string(retrieved_chunks)
83
+ answer = self.llm.query(context=context_string, query=query)
84
+ sources = [
85
+ {
86
+ "chunk_id": chunk.chunk_id,
87
+ "similarity": round(chunk.similarity, 3),
88
+ "preview": chunk.text[:100] + "..." if len(chunk.text) > 100 else chunk.text
89
+ }
90
+ for chunk in retrieved_chunks
91
+ ]
92
+
93
+ response_time_ms = (time.time() - start_time) * 1000
94
+
95
+ return EvaluationReadyResponse(
96
+ query=query,
97
+ answer=answer,
98
+ contexts=contexts, # Full texts for RAGAS!
99
+ sources=sources,
100
+ chunks_used=len(contexts),
101
+ response_time_ms=response_time_ms,
102
+ status="success"
103
+ )
src/evaluation/ragas_integration/ragas_endpoints.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAGAS API Endpoints
3
+ """
4
+ from fastapi import APIRouter, HTTPException
5
+ from pydantic import BaseModel
6
+ from typing import List, Optional
7
+
8
+ router = APIRouter()
9
+
10
+ # We'll set these from main.py
11
+ ragas_pipeline = None
12
+ ragas_evaluator = None
13
+
14
+
15
+ class RagasEvalRequest(BaseModel):
16
+ """Direct evaluation request."""
17
+ query: str
18
+ answer: str
19
+ contexts: List[str]
20
+
21
+
22
+ class RagasQueryRequest(BaseModel):
23
+ """Query + evaluate request."""
24
+ query: str
25
+ top_k: int = 3
26
+
27
+
28
+ def init_ragas_router(pipeline, evaluator):
29
+ """Initialize router with pipeline and evaluator instances."""
30
+ global ragas_pipeline, ragas_evaluator
31
+ ragas_pipeline = pipeline
32
+ ragas_evaluator = evaluator
33
+ return router
34
+
35
+
36
+ @router.post("/evaluate")
37
+ async def evaluate_direct(request: RagasEvalRequest):
38
+ """
39
+ Evaluate a query-answer pair directly.
40
+
41
+ Use this when you already have the answer and contexts.
42
+ """
43
+ if not ragas_evaluator:
44
+ raise HTTPException(status_code=503, detail="RAGAS not initialized")
45
+
46
+ result = await ragas_evaluator.evaluate_single(
47
+ query=request.query,
48
+ answer=request.answer,
49
+ contexts=request.contexts
50
+ )
51
+
52
+ return {
53
+ "eval_id": result.eval_id,
54
+ "faithfulness": result.faithfulness,
55
+ "context_precision": result.context_precision,
56
+ "ragas_score": result.ragas_score,
57
+ "latency_ms": round(result.latency_ms, 2)
58
+ }
59
+
60
+
61
+ @router.post("/query-and-evaluate")
62
+ async def query_and_evaluate(request: RagasQueryRequest):
63
+ """
64
+ Query the RAG system AND evaluate the response.
65
+
66
+ Returns both the answer and RAGAS metrics.
67
+ """
68
+ if not ragas_pipeline or not ragas_evaluator:
69
+ raise HTTPException(status_code=503, detail="RAGAS not initialized")
70
+
71
+ # Step 1: Query pipeline
72
+ response = ragas_pipeline.query_for_evaluation(
73
+ query=request.query,
74
+ top_k=request.top_k
75
+ )
76
+
77
+ if response.status != "success":
78
+ return {
79
+ "query": response.query,
80
+ "answer": response.answer,
81
+ "status": response.status,
82
+ "ragas": None
83
+ }
84
+
85
+ # Step 2: Evaluate with RAGAS
86
+ eval_result = await ragas_evaluator.evaluate_single(
87
+ query=response.query,
88
+ answer=response.answer,
89
+ contexts=response.contexts
90
+ )
91
+
92
+ return {
93
+ "query": response.query,
94
+ "answer": response.answer,
95
+ "sources": response.sources,
96
+ "chunks_used": response.chunks_used,
97
+ "response_time_ms": round(response.response_time_ms, 2),
98
+ "ragas": {
99
+ "eval_id": eval_result.eval_id,
100
+ "faithfulness": eval_result.faithfulness,
101
+ "context_precision": eval_result.context_precision,
102
+ "ragas_score": eval_result.ragas_score,
103
+ "eval_time_ms": round(eval_result.latency_ms, 2)
104
+ }
105
+ }
106
+
107
+
108
+ @router.get("/metrics")
109
+ async def get_metrics():
110
+ """Get aggregate RAGAS metrics from all evaluations."""
111
+ if not ragas_evaluator:
112
+ raise HTTPException(status_code=503, detail="RAGAS not initialized")
113
+
114
+ results = ragas_evaluator.results
115
+
116
+ if not results:
117
+ return {"total_evaluations": 0, "message": "No evaluations yet"}
118
+
119
+ # Calculate averages
120
+ avg_faithfulness = sum(r.faithfulness for r in results) / len(results)
121
+ avg_precision = sum(r.context_precision for r in results) / len(results)
122
+ avg_ragas = sum(r.ragas_score for r in results) / len(results)
123
+
124
+ return {
125
+ "total_evaluations": len(results),
126
+ "avg_faithfulness": round(avg_faithfulness, 3),
127
+ "avg_context_precision": round(avg_precision, 3),
128
+ "avg_ragas_score": round(avg_ragas, 3)
129
+ }
src/evaluation/ragas_integration/ragas_evaluator.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAGAS Evaluator - Core evaluation logic using RAGAS framework
3
+ """
4
+ import os
5
+ import logging
6
+ from typing import List, Dict, Any, Optional
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+
10
+ # RAGAS imports
11
+ from ragas.metrics import (
12
+ Faithfulness,
13
+ ResponseRelevancy,
14
+ LLMContextPrecisionWithoutReference,
15
+ )
16
+ from ragas.llms import LangchainLLMWrapper
17
+ from ragas.dataset_schema import SingleTurnSample
18
+
19
+ # LangChain for LLM wrapper (RAGAS requirement)
20
+ from langchain_groq import ChatGroq
21
+
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ @dataclass
27
+ class RagasEvaluationResult:
28
+ """Result from RAGAS evaluation."""
29
+ eval_id: str
30
+ query: str
31
+
32
+ # RAGAS metrics (0-1 scale)
33
+ faithfulness: float
34
+ answer_relevancy: float
35
+ context_precision: float
36
+
37
+ # Composite score
38
+ ragas_score: float = 0.0
39
+
40
+ # Metadata
41
+ latency_ms: float = 0.0
42
+ timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
43
+
44
+ def __post_init__(self):
45
+ """Calculate composite RAGAS score."""
46
+ scores = [self.faithfulness, self.context_precision]
47
+ valid_scores = [s for s in scores if s > 0]
48
+ self.ragas_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
49
+
50
+
51
+ class RagasEvaluator:
52
+ """
53
+ Evaluates RAG responses using RAGAS metrics.
54
+
55
+ Metrics:
56
+ - Faithfulness: Is the answer grounded in the context?
57
+ - Answer Relevancy: Does the answer address the question?
58
+ - Context Precision: Are the retrieved chunks useful?
59
+ """
60
+
61
+ def __init__(self, groq_api_key: Optional[str] = None):
62
+ """
63
+ Initialize RAGAS evaluator.
64
+
65
+ Args:
66
+ groq_api_key: Your Groq API key (or uses GROQ_API_KEY env var)
67
+ """
68
+ # TODO: Step 1 - Get API key
69
+ api_key = groq_api_key or os.getenv("GROQ_API_KEY")
70
+ if not api_key:
71
+ raise ValueError("GROQ_API_KEY required")
72
+ llm = ChatGroq(
73
+ api_key=api_key,
74
+ model_name="llama-3.3-70b-versatile",
75
+ temperature=0
76
+ )
77
+
78
+ self.evaluator_llm = LangchainLLMWrapper(llm)
79
+
80
+
81
+ self.faithfulness = Faithfulness(llm=self.evaluator_llm)
82
+ # self.answer_relevancy = ResponseRelevancy(llm=self.evaluator_llm)
83
+ self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)
84
+
85
+ # Storage for results
86
+ self.results: List[RagasEvaluationResult] = []
87
+
88
+ logger.info("βœ“ RAGAS Evaluator initialized (Faithfulness + Context Precision)")
89
+
90
+ async def evaluate_single(
91
+ self,
92
+ query: str,
93
+ answer: str,
94
+ contexts: List[str],
95
+ ground_truth: Optional[str] = None
96
+ ) -> RagasEvaluationResult:
97
+ """
98
+ Evaluate a single RAG response.
99
+ """
100
+ import time
101
+ import hashlib
102
+
103
+ start_time = time.time()
104
+
105
+ # 1. Create SingleTurnSample
106
+ sample = SingleTurnSample(
107
+ user_input=query,
108
+ response=answer,
109
+ retrieved_contexts=contexts,
110
+ reference=ground_truth or ""
111
+ )
112
+
113
+ # 2. Score with each metric (async!)
114
+ faithfulness_score = await self.faithfulness.single_turn_ascore(sample)
115
+ # answer_relevancy_score = await self.answer_relevancy.single_turn_ascore(sample)
116
+ answer_relevancy_score = None
117
+ context_precision_score = await self.context_precision.single_turn_ascore(sample)
118
+
119
+ # 3. Calculate latency
120
+ latency_ms = (time.time() - start_time) * 1000
121
+
122
+ # 4. Generate eval_id
123
+ eval_id = hashlib.md5(f"{query}{datetime.now().isoformat()}".encode()).hexdigest()[:8]
124
+
125
+ # 5. Create and store result
126
+ result = RagasEvaluationResult(
127
+ eval_id=eval_id,
128
+ query=query,
129
+ faithfulness=float(faithfulness_score),
130
+ answer_relevancy=0.0, #float(answer_relevancy_score),
131
+ context_precision=float(context_precision_score),
132
+ latency_ms=latency_ms
133
+ )
134
+
135
+ self.results.append(result)
136
+
137
+ logger.info(f"Evaluation complete: RAGAS score = {result.ragas_score:.3f}")
138
+
139
+ return result
src/evaluation/ragas_integration/test_ragas.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick test for RAGAS integration
3
+ Run: python -m src.evaluation.ragas_integration.test_ragas
4
+ """
5
+ import asyncio
6
+ from src.rag import RAGPipeline, RAGConfig
7
+ from src.evaluation.ragas_integration import RagasReadyPipeline, RagasEvaluator
8
+
9
+
10
+ async def test_ragas():
11
+ print("=" * 50)
12
+ print("Testing RAGAS Integration")
13
+ print("=" * 50)
14
+
15
+ # Step 1: Initialize pipeline
16
+ print("\n1. Initializing pipeline...")
17
+ pipeline = RAGPipeline(RAGConfig())
18
+ ragas_pipeline = RagasReadyPipeline(pipeline)
19
+ print(" βœ“ Pipeline ready")
20
+
21
+ # Step 2: Initialize evaluator
22
+ print("\n2. Initializing RAGAS evaluator...")
23
+ evaluator = RagasEvaluator()
24
+ print(" βœ“ Evaluator ready")
25
+
26
+ # Step 3: Ingest a test document
27
+ print("\n3. Ingesting test document...")
28
+ test_text = """
29
+ Machine learning is a subset of artificial intelligence that enables
30
+ systems to learn from data. Deep learning is a type of machine learning
31
+ that uses neural networks with multiple layers. Natural language processing
32
+ (NLP) is used to understand human language.
33
+ """
34
+ pipeline.ingest("test_doc", test_text)
35
+ print(f" βœ“ Ingested {pipeline.vector_store.size()} chunks")
36
+
37
+ # Step 4: Query with full context capture
38
+ print("\n4. Querying pipeline...")
39
+ response = ragas_pipeline.query_for_evaluation("What is machine learning?")
40
+ print(f" Query: {response.query}")
41
+ print(f" Answer: {response.answer[:100]}...")
42
+ print(f" Contexts captured: {len(response.contexts)}")
43
+ print(f" Status: {response.status}")
44
+
45
+ # Step 5: Evaluate with RAGAS
46
+ print("\n5. Running RAGAS evaluation...")
47
+ result = await evaluator.evaluate_single(
48
+ query=response.query,
49
+ answer=response.answer,
50
+ contexts=response.contexts
51
+ )
52
+
53
+ print(f"\n{'=' * 50}")
54
+ print("RAGAS RESULTS")
55
+ print(f"{'=' * 50}")
56
+ print(f" Faithfulness: {result.faithfulness:.3f}")
57
+ print(f" Answer Relevancy: {result.answer_relevancy:.3f}")
58
+ print(f" Context Precision: {result.context_precision:.3f}")
59
+ print(f" ─────────────────────────────")
60
+ print(f" RAGAS Score: {result.ragas_score:.3f}")
61
+ print(f" Eval Time: {result.latency_ms:.0f}ms")
62
+ print(f"{'=' * 50}")
63
+
64
+ return result
65
+
66
+
67
+ if __name__ == "__main__":
68
+ asyncio.run(test_ragas())
src/main.py CHANGED
@@ -10,6 +10,12 @@ from datetime import datetime
10
  import tempfile
11
  from pathlib import Path
12
 
 
 
 
 
 
 
13
  from src.rag import RAGPipeline, RAGConfig
14
  from src.evaluation import RAGEvaluator, EvaluationResult
15
  import io
@@ -49,7 +55,8 @@ if os.path.exists("frontend"):
49
 
50
  # Global pipeline instance
51
  pipeline: Optional[RAGPipeline] = None
52
-
 
53
 
54
  # ==================== Pydantic Models ====================
55
 
@@ -108,7 +115,7 @@ class StatsResponse(BaseModel):
108
  @app.on_event("startup")
109
  async def startup_event():
110
  """Initialize pipeline on startup."""
111
- global pipeline
112
 
113
  logger.info("=" * 60)
114
  logger.info("Starting Document Intelligence RAG API")
@@ -124,8 +131,18 @@ async def startup_event():
124
 
125
  # Initialize pipeline (automatically uses get_embeddings_client())
126
  pipeline = RAGPipeline(config=config)
127
-
128
  logger.info("βœ“ Pipeline initialized successfully")
 
 
 
 
 
 
 
 
 
 
 
129
  logger.info(f"βœ“ Embedding backend: {config.embedding_backend}")
130
  logger.info(f"βœ“ API ready at http://localhost:8000")
131
  logger.info(f"βœ“ Interactive docs at http://localhost:8000/docs")
@@ -429,7 +446,7 @@ async def reset_system():
429
  Returns:
430
  Reset confirmation
431
  """
432
- global pipeline
433
 
434
  if not pipeline:
435
  raise HTTPException(status_code=503, detail="Pipeline not initialized")
@@ -439,12 +456,15 @@ async def reset_system():
439
 
440
  # Clear vector store
441
  pipeline.vector_store.clear()
 
 
 
442
 
443
  logger.info("βœ“ System reset complete")
444
 
445
  return {
446
  "status": "success",
447
- "message": "All documents and embeddings cleared",
448
  "chunks_remaining": 0,
449
  "timestamp": datetime.now().isoformat()
450
  }
@@ -659,6 +679,16 @@ async def query_with_evaluation(request: dict):
659
  return {"error": str(e)}, 500
660
 
661
 
 
 
 
 
 
 
 
 
 
 
662
  # ==================== Root Endpoint ====================
663
 
664
  @app.get("/", response_class=FileResponse)
 
10
  import tempfile
11
  from pathlib import Path
12
 
13
+ from src.evaluation.ragas_integration import (
14
+ RagasReadyPipeline,
15
+ RagasEvaluator,
16
+ init_ragas_router,
17
+ )
18
+
19
  from src.rag import RAGPipeline, RAGConfig
20
  from src.evaluation import RAGEvaluator, EvaluationResult
21
  import io
 
55
 
56
  # Global pipeline instance
57
  pipeline: Optional[RAGPipeline] = None
58
+ ragas_pipeline = None
59
+ ragas_evaluator = None
60
 
61
  # ==================== Pydantic Models ====================
62
 
 
115
  @app.on_event("startup")
116
  async def startup_event():
117
  """Initialize pipeline on startup."""
118
+ global pipeline, ragas_pipeline, ragas_evaluator
119
 
120
  logger.info("=" * 60)
121
  logger.info("Starting Document Intelligence RAG API")
 
131
 
132
  # Initialize pipeline (automatically uses get_embeddings_client())
133
  pipeline = RAGPipeline(config=config)
 
134
  logger.info("βœ“ Pipeline initialized successfully")
135
+
136
+ # RAGAS integration
137
+ ragas_pipeline = RagasReadyPipeline(pipeline)
138
+ logger.info("βœ“ Ragas pipeline initialized successfully")
139
+ ragas_evaluator = RagasEvaluator()
140
+ logger.info("βœ“ Ragas evaluator initialized successfully")
141
+ ragas_router = init_ragas_router(ragas_pipeline, ragas_evaluator)
142
+ app.include_router(ragas_router, prefix="/ragas", tags=["RAGAS Evaluation"])
143
+ logger.info("βœ“ Ragas evaluator initialized successfully")
144
+
145
+
146
  logger.info(f"βœ“ Embedding backend: {config.embedding_backend}")
147
  logger.info(f"βœ“ API ready at http://localhost:8000")
148
  logger.info(f"βœ“ Interactive docs at http://localhost:8000/docs")
 
446
  Returns:
447
  Reset confirmation
448
  """
449
+ global pipeline, ragas_evaluator
450
 
451
  if not pipeline:
452
  raise HTTPException(status_code=503, detail="Pipeline not initialized")
 
456
 
457
  # Clear vector store
458
  pipeline.vector_store.clear()
459
+ if ragas_evaluator:
460
+ ragas_evaluator.results = []
461
+ logger.info("βœ“ RAGAS evaluations cleared")
462
 
463
  logger.info("βœ“ System reset complete")
464
 
465
  return {
466
  "status": "success",
467
+ "message": "All documents, embeddings, and RAGAS evaluations cleared",
468
  "chunks_remaining": 0,
469
  "timestamp": datetime.now().isoformat()
470
  }
 
679
  return {"error": str(e)}, 500
680
 
681
 
682
+ # ===================== RAGAS Endpoints ====================
683
+
684
+ @app.get("/ragas-demo")
685
+ async def ragas_demo_page():
686
+ """Serve RAGAS evaluation demo page."""
687
+ frontend_path = "frontend/ragas.html"
688
+ if os.path.exists(frontend_path):
689
+ return FileResponse(frontend_path)
690
+ return {"error": "RAGAS demo page not found"}
691
+
692
  # ==================== Root Endpoint ====================
693
 
694
  @app.get("/", response_class=FileResponse)
uv.lock CHANGED
The diff for this file is too large to render. See raw diff