Spaces:
Sleeping
Sleeping
feat: add RAGAS eval
Browse files- evaluation_results/_results.jsonl +50 -0
- evaluation_results/results.jsonl +20 -50
- frontend/evaluation.html +567 -341
- frontend/index.html +1 -0
- frontend/index_.html +592 -0
- frontend/ragas.html +671 -0
- pyproject.toml +5 -2
- run_real_evaluation.py +473 -0
- src/evaluation/ragas_integration/__init__.py +14 -0
- src/evaluation/ragas_integration/pipeline_adapter.py +103 -0
- src/evaluation/ragas_integration/ragas_endpoints.py +129 -0
- src/evaluation/ragas_integration/ragas_evaluator.py +139 -0
- src/evaluation/ragas_integration/test_ragas.py +68 -0
- src/main.py +35 -5
- uv.lock +0 -0
evaluation_results/_results.jsonl
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8615102352119911, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.6217199504672873, "bert_score": 0.9101784656133992, "answer_relevance": 0.8611807441816679, "faithfulness": 0.9889532712914122, "hallucination_detected": 0, "source_attribution_score": 0.9197433053801606, "latency_ms": 193.9050181207473, "tokens_used": 180, "cost_cents": 0.1947866279399885, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
|
| 2 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6691151867351297, "retrieval_recall": 0.823127264267807, "rank_position": 1, "rouge_l": 0.714583633420124, "bert_score": 0.7968070501948343, "answer_relevance": 0.8386952468169229, "faithfulness": 0.8427198816502497, "hallucination_detected": 0, "source_attribution_score": 0.834049480985246, "latency_ms": 309.52617615332184, "tokens_used": 130, "cost_cents": 0.5222450372503339, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
|
| 3 |
+
{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 2, "retrieval_precision": 0.7820863657323606, "retrieval_recall": 0.7278826391993161, "rank_position": 4, "rouge_l": 0.7288516571075816, "bert_score": 0.800838399605806, "answer_relevance": 0.7623839343155656, "faithfulness": 0.760938424869514, "hallucination_detected": 0, "source_attribution_score": 0.7367638541396095, "latency_ms": 127.88553000716428, "tokens_used": 86, "cost_cents": 0.6028654205830427, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
|
| 4 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8032957101002208, "retrieval_recall": 0.9059703284838815, "rank_position": 1, "rouge_l": 0.8300273338544246, "bert_score": 0.9454453940286349, "answer_relevance": 0.9520338304764728, "faithfulness": 0.897131533318752, "hallucination_detected": 0, "source_attribution_score": 0.9492810947177941, "latency_ms": 465.42015740446305, "tokens_used": 223, "cost_cents": 0.20152073810222879, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
|
| 5 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8530337397480929, "retrieval_recall": 0.7059261296867919, "rank_position": 2, "rouge_l": 0.7193205808960748, "bert_score": 0.9022171118953591, "answer_relevance": 0.8531732924021801, "faithfulness": 0.8121930123501006, "hallucination_detected": 0, "source_attribution_score": 0.8249719199625603, "latency_ms": 118.93191807619638, "tokens_used": 156, "cost_cents": 0.6705483559336415, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
|
| 6 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.871851781632808, "retrieval_recall": 1.0, "rank_position": 2, "rouge_l": 0.8763052973676115, "bert_score": 0.9155189067363468, "answer_relevance": 0.7819811920531572, "faithfulness": 0.9020511875557776, "hallucination_detected": 0, "source_attribution_score": 0.8835911263653357, "latency_ms": 215.06062627830062, "tokens_used": 158, "cost_cents": 0.28454809454724767, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "c4f50504"}
|
| 7 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8374406518052472, "retrieval_recall": 0.8995269271491464, "rank_position": 1, "rouge_l": 0.6625752862799461, "bert_score": 0.8433178449037969, "answer_relevance": 0.8430013035861083, "faithfulness": 0.893951241843859, "hallucination_detected": 0, "source_attribution_score": 0.7615935243739598, "latency_ms": 419.38297913278507, "tokens_used": 219, "cost_cents": 0.685936998794628, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
|
| 8 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7801096274968522, "retrieval_recall": 0.6863130363664056, "rank_position": 2, "rouge_l": 0.7314517596590595, "bert_score": 0.8276297752821052, "answer_relevance": 0.7915041989155733, "faithfulness": 0.814200114298667, "hallucination_detected": 0, "source_attribution_score": 0.7910397701255416, "latency_ms": 192.75282528673864, "tokens_used": 140, "cost_cents": 0.5706402044081957, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
|
| 9 |
+
{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9319798193959905, "retrieval_recall": 0.7301414759104026, "rank_position": 3, "rouge_l": 0.9195189478153559, "bert_score": 0.9506571721308754, "answer_relevance": 0.9012898093375585, "faithfulness": 0.9159276711160365, "hallucination_detected": 0, "source_attribution_score": 0.8105097496319957, "latency_ms": 310.01153330005803, "tokens_used": 134, "cost_cents": 0.36313962364633723, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "ff093944"}
|
| 10 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.987131336980288, "retrieval_recall": 0.8161833189974133, "rank_position": 2, "rouge_l": 0.754633239450571, "bert_score": 0.8525460742457374, "answer_relevance": 0.8388153285264023, "faithfulness": 0.8947958687708046, "hallucination_detected": 0, "source_attribution_score": 0.8670965141635586, "latency_ms": 367.03119966417205, "tokens_used": 105, "cost_cents": 0.6425165690009661, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1237660e"}
|
| 11 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8161943972603446, "retrieval_recall": 0.8191451209916161, "rank_position": 1, "rouge_l": 0.8566238483374247, "bert_score": 0.8407886193759627, "answer_relevance": 0.788647130938179, "faithfulness": 0.9458751488959517, "hallucination_detected": 0, "source_attribution_score": 0.8442883639082127, "latency_ms": 394.39735015927437, "tokens_used": 126, "cost_cents": 0.6182353694114775, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "730981e3"}
|
| 12 |
+
{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9268720168249583, "retrieval_recall": 0.7744657390458949, "rank_position": 1, "rouge_l": 0.726991263638828, "bert_score": 0.9072089249292097, "answer_relevance": 0.7368736773342853, "faithfulness": 0.9109545928726132, "hallucination_detected": 0, "source_attribution_score": 0.8389074559482628, "latency_ms": 363.97033617468753, "tokens_used": 89, "cost_cents": 0.1520075706493582, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
|
| 13 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 1.0, "retrieval_recall": 0.7883493024047399, "rank_position": 2, "rouge_l": 0.8794507996771228, "bert_score": 0.9890372805052198, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9202874935555082, "latency_ms": 180.5318450150473, "tokens_used": 164, "cost_cents": 0.3633483811341406, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "637a4c17"}
|
| 14 |
+
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9574712547642229, "retrieval_recall": 0.898715076798533, "rank_position": 3, "rouge_l": 0.8192000079755279, "bert_score": 0.8864239733582311, "answer_relevance": 0.7428977779588922, "faithfulness": 0.9030187960492433, "hallucination_detected": 0, "source_attribution_score": 0.7624554954695243, "latency_ms": 373.14060250844705, "tokens_used": 84, "cost_cents": 0.7117749597236492, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "abaf4ca6"}
|
| 15 |
+
{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9405730690612055, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.8808942707215686, "bert_score": 0.9132934029079159, "answer_relevance": 0.905135078735406, "faithfulness": 0.8931492108116512, "hallucination_detected": 0, "source_attribution_score": 0.8268512614166635, "latency_ms": 339.0772795799579, "tokens_used": 214, "cost_cents": 0.7190302687955942, "timestamp": "2025-12-29T16:33:52.679894", "eval_id": "1236ae18"}
|
| 16 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9730519547186312, "retrieval_recall": 0.7000247515495168, "rank_position": 1, "rouge_l": 0.8481502060571952, "bert_score": 0.8512169150469242, "answer_relevance": 0.6066385743234217, "faithfulness": 0.90739914345254, "hallucination_detected": 0, "source_attribution_score": 0.7272214400773345, "latency_ms": 180.27676298939465, "tokens_used": 213, "cost_cents": 0.32992956367012927, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
|
| 17 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.882795855159822, "retrieval_recall": 0.9536984414043154, "rank_position": 2, "rouge_l": 0.8652639536487609, "bert_score": 0.9510206928805952, "answer_relevance": 0.98, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.9041183043586343, "latency_ms": 304.8777755850387, "tokens_used": 202, "cost_cents": 0.4573272605920282, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ea8a82db"}
|
| 18 |
+
{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9074913930124041, "retrieval_recall": 0.8337368592917234, "rank_position": 2, "rouge_l": 0.7866443218610295, "bert_score": 0.8890794098843255, "answer_relevance": 0.8644104258787939, "faithfulness": 0.9162807414858771, "hallucination_detected": 0, "source_attribution_score": 0.8535049903935279, "latency_ms": 153.72881316060213, "tokens_used": 192, "cost_cents": 0.42866080997615663, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "f93850ae"}
|
| 19 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.736362372230679, "retrieval_recall": 0.6501144391355456, "rank_position": 1, "rouge_l": 0.550190206571193, "bert_score": 0.7928686128619693, "answer_relevance": 0.7479453349256642, "faithfulness": 0.724801089955014, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 318.4624295326313, "tokens_used": 190, "cost_cents": 0.2437715682365154, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "97deba54"}
|
| 20 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.6735724740087383, "retrieval_recall": 0.8542061489968834, "rank_position": 3, "rouge_l": 0.753272128099022, "bert_score": 0.7951286425632846, "answer_relevance": 0.7111372864814403, "faithfulness": 0.7658078142862852, "hallucination_detected": 0, "source_attribution_score": 0.7921982358187583, "latency_ms": 74.69663423689695, "tokens_used": 81, "cost_cents": 0.27307125512490826, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "d72f240c"}
|
| 21 |
+
{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
|
| 22 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
|
| 23 |
+
{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
|
| 24 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
|
| 25 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
|
| 26 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
|
| 27 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
|
| 28 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
|
| 29 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
|
| 30 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
|
| 31 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
|
| 32 |
+
{"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
|
| 33 |
+
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
|
| 34 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
|
| 35 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
|
| 36 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
|
| 37 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
|
| 38 |
+
{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
|
| 39 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
|
| 40 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
|
| 41 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
|
| 42 |
+
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
|
| 43 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
|
| 44 |
+
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
|
| 45 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
|
| 46 |
+
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
|
| 47 |
+
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
|
| 48 |
+
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
|
| 49 |
+
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
|
| 50 |
+
{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}
|
evaluation_results/results.jsonl
CHANGED
|
@@ -1,50 +1,20 @@
|
|
| 1 |
-
{"query": "What is the
|
| 2 |
-
{"query": "What
|
| 3 |
-
{"query": "
|
| 4 |
-
{"query": "What
|
| 5 |
-
{"query": "What
|
| 6 |
-
{"query": "What
|
| 7 |
-
{"query": "What
|
| 8 |
-
{"query": "
|
| 9 |
-
{"query": "What
|
| 10 |
-
{"query": "What
|
| 11 |
-
{"query": "What
|
| 12 |
-
{"query": "What
|
| 13 |
-
{"query": "
|
| 14 |
-
{"query": "What is the
|
| 15 |
-
{"query": "What is the
|
| 16 |
-
{"query": "
|
| 17 |
-
{"query": "What
|
| 18 |
-
{"query": "
|
| 19 |
-
{"query": "
|
| 20 |
-
{"query": "What
|
| 21 |
-
{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8823590395988979, "retrieval_recall": 0.8054000463419554, "rank_position": 4, "rouge_l": 0.657165617121062, "bert_score": 0.7163229867147761, "answer_relevance": 0.9214749005186091, "faithfulness": 0.7984409109004382, "hallucination_detected": 0, "source_attribution_score": 0.8050997029870711, "latency_ms": 320.5743801768407, "tokens_used": 139, "cost_cents": 0.18256542430243572, "timestamp": "2025-12-29T16:33:52.685157", "eval_id": "ca0963ac"}
|
| 22 |
-
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8280930369418137, "retrieval_recall": 0.8386391743395781, "rank_position": 2, "rouge_l": 0.859195213413575, "bert_score": 0.8490706943949407, "answer_relevance": 0.8362695754227021, "faithfulness": 0.9851436561648604, "hallucination_detected": 0, "source_attribution_score": 0.9006380350406552, "latency_ms": 315.9631216373207, "tokens_used": 204, "cost_cents": 0.7895173564931882, "timestamp": "2025-12-29T16:33:52.686903", "eval_id": "225ada1b"}
|
| 23 |
-
{"query": "What is the recommended dosage for this medication?", "answer": "Based on the clinical data, what is the recommended dosage for this medication. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9229645259544321, "retrieval_recall": 0.9045329288076733, "rank_position": 1, "rouge_l": 0.6794903586334973, "bert_score": 0.9114979234771378, "answer_relevance": 0.8710294892629211, "faithfulness": 0.8819010505339767, "hallucination_detected": 0, "source_attribution_score": 0.828883744019921, "latency_ms": 310.1823792045738, "tokens_used": 92, "cost_cents": 0.2911547513375744, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "d5d72fda"}
|
| 24 |
-
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8319845838750437, "retrieval_recall": 0.8232464235414909, "rank_position": 2, "rouge_l": 0.8391753287550867, "bert_score": 0.8678652920374175, "answer_relevance": 0.7651917363208208, "faithfulness": 0.9490917638308898, "hallucination_detected": 0, "source_attribution_score": 0.8135628503503984, "latency_ms": 321.0052546194483, "tokens_used": 245, "cost_cents": 0.5595103002738705, "timestamp": "2025-12-29T16:33:52.687494", "eval_id": "9ea17371"}
|
| 25 |
-
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8901844188411807, "retrieval_recall": 0.8188640273866499, "rank_position": 1, "rouge_l": 0.6292299958891738, "bert_score": 0.8351537114403716, "answer_relevance": 0.9379221844509074, "faithfulness": 0.889919633199584, "hallucination_detected": 0, "source_attribution_score": 0.9361556905489479, "latency_ms": 301.31795277671677, "tokens_used": 193, "cost_cents": 0.5017837116409055, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
|
| 26 |
-
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.7410372108118369, "retrieval_recall": 0.966334676629508, "rank_position": 4, "rouge_l": 0.7403451654608713, "bert_score": 0.9267453574782148, "answer_relevance": 0.8281170820867129, "faithfulness": 0.86390200814052, "hallucination_detected": 0, "source_attribution_score": 0.8378864895727723, "latency_ms": 211.24391674054434, "tokens_used": 83, "cost_cents": 0.2983786385854106, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "a061c7fe"}
|
| 27 |
-
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8720063460928076, "retrieval_recall": 0.6729254224539245, "rank_position": 2, "rouge_l": 0.8327709659558473, "bert_score": 0.8128271397265061, "answer_relevance": 0.8905255016851306, "faithfulness": 0.84745277001056, "hallucination_detected": 0, "source_attribution_score": 0.8230615394254884, "latency_ms": 283.8294453572478, "tokens_used": 250, "cost_cents": 0.6332729607669917, "timestamp": "2025-12-29T16:33:52.687933", "eval_id": "2edfcccd"}
|
| 28 |
-
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 2, "retrieval_precision": 0.9301020713000657, "retrieval_recall": 0.924804842721284, "rank_position": 1, "rouge_l": 0.9427204506133842, "bert_score": 0.9349403716685819, "answer_relevance": 0.8945900053205512, "faithfulness": 0.9102438848352746, "hallucination_detected": 0, "source_attribution_score": 0.99, "latency_ms": 277.97498285046345, "tokens_used": 196, "cost_cents": 0.7801133042353303, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "8fdf6b7c"}
|
| 29 |
-
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.899280381787354, "retrieval_recall": 0.8961888094914131, "rank_position": 2, "rouge_l": 0.5936623542297897, "bert_score": 0.823996206720772, "answer_relevance": 0.6865616319136963, "faithfulness": 0.8144270370656516, "hallucination_detected": 0, "source_attribution_score": 0.9211159702320861, "latency_ms": 316.20020030370006, "tokens_used": 94, "cost_cents": 0.7486503882498293, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "2ce76cd9"}
|
| 30 |
-
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8315011482402368, "retrieval_recall": 0.833569355528467, "rank_position": 1, "rouge_l": 0.8225004807085223, "bert_score": 0.8431786717167729, "answer_relevance": 0.7471615327404427, "faithfulness": 0.8178606484394222, "hallucination_detected": 0, "source_attribution_score": 0.7317171144269652, "latency_ms": 265.3077015433886, "tokens_used": 228, "cost_cents": 0.2775564966165721, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
|
| 31 |
-
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf"], "num_retrieved": 1, "retrieval_precision": 0.8641595010789612, "retrieval_recall": 0.7945909900018892, "rank_position": 1, "rouge_l": 0.8006758319947014, "bert_score": 0.8321939471946035, "answer_relevance": 0.825745775211993, "faithfulness": 0.8467257172080817, "hallucination_detected": 0, "source_attribution_score": 0.8497391658427235, "latency_ms": 235.03663142966545, "tokens_used": 141, "cost_cents": 0.17524629198643646, "timestamp": "2025-12-29T16:33:52.688937", "eval_id": "d06ff1bd"}
|
| 32 |
-
{"query": "What is the success rate from the phase II trial?", "answer": "Based on the clinical data, what is the success rate from the phase ii trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.6903702549893261, "retrieval_recall": 0.73223634008384, "rank_position": 2, "rouge_l": 0.707429022155934, "bert_score": 0.777869930411189, "answer_relevance": 0.7031065283777661, "faithfulness": 0.7, "hallucination_detected": 0, "source_attribution_score": 0.7439494136650804, "latency_ms": 400.82343329582545, "tokens_used": 98, "cost_cents": 0.7870249846125801, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "36d2fc3b"}
|
| 33 |
-
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.725867027337326, "retrieval_recall": 0.863170416240463, "rank_position": 2, "rouge_l": 0.8912824102328486, "bert_score": 0.9643405650883139, "answer_relevance": 0.8885158015034251, "faithfulness": 0.99, "hallucination_detected": 0, "source_attribution_score": 0.8784123194447961, "latency_ms": 236.3847138217219, "tokens_used": 204, "cost_cents": 0.5521449515774235, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "682b9450"}
|
| 34 |
-
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9750320447486492, "retrieval_recall": 0.7245802712668319, "rank_position": 1, "rouge_l": 0.7099476163376697, "bert_score": 0.9440083937887742, "answer_relevance": 0.8156100248089608, "faithfulness": 0.8919262171326391, "hallucination_detected": 0, "source_attribution_score": 0.863174486121895, "latency_ms": 362.8327560575255, "tokens_used": 189, "cost_cents": 0.7171165823008571, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "0de1bbf5"}
|
| 35 |
-
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9015276357455221, "retrieval_recall": 0.9264841648056893, "rank_position": 1, "rouge_l": 0.891494274758995, "bert_score": 0.99, "answer_relevance": 0.7355355223593343, "faithfulness": 0.9702206503913026, "hallucination_detected": 0, "source_attribution_score": 0.777455505087579, "latency_ms": 321.1816552932661, "tokens_used": 247, "cost_cents": 0.7289428286892591, "timestamp": "2025-12-29T16:33:52.689939", "eval_id": "848a59c9"}
|
| 36 |
-
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 3, "retrieval_precision": 0.9769493002143846, "retrieval_recall": 0.7089498127174636, "rank_position": 1, "rouge_l": 0.743951051682124, "bert_score": 0.88134771993094, "answer_relevance": 0.8198995975819598, "faithfulness": 0.759966915206261, "hallucination_detected": 0, "source_attribution_score": 0.653680388081969, "latency_ms": 394.7220409253053, "tokens_used": 217, "cost_cents": 0.3830027894556253, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "d882ccef"}
|
| 37 |
-
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Pharmacokinetics_Study.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9190628509274618, "retrieval_recall": 0.8737374216126653, "rank_position": 1, "rouge_l": 0.7916949852181128, "bert_score": 0.9615837240171882, "answer_relevance": 0.8916037889745834, "faithfulness": 0.8590999883691032, "hallucination_detected": 0, "source_attribution_score": 0.7580742362127584, "latency_ms": 158.1258379270646, "tokens_used": 169, "cost_cents": 0.3752840055083183, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "f7a91f0a"}
|
| 38 |
-
{"query": "What are the inclusion/exclusion criteria for this study?", "answer": "Based on the clinical data, what are the inclusion/exclusion criteria for this study. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Pharmacokinetics_Study.pdf"], "num_retrieved": 4, "retrieval_precision": 0.9933833682135537, "retrieval_recall": 0.9982061025926003, "rank_position": 2, "rouge_l": 0.778356020463265, "bert_score": 0.8294547617138849, "answer_relevance": 0.7798722760563348, "faithfulness": 0.9499485856550234, "hallucination_detected": 0, "source_attribution_score": 0.7780376963536395, "latency_ms": 173.39694248228693, "tokens_used": 88, "cost_cents": 0.5571400235923032, "timestamp": "2025-12-29T16:33:52.690939", "eval_id": "c57f0a77"}
|
| 39 |
-
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf"], "num_retrieved": 1, "retrieval_precision": 0.805738836595438, "retrieval_recall": 0.6065653094384298, "rank_position": 1, "rouge_l": 0.7577008392952846, "bert_score": 0.8073994742363001, "answer_relevance": 0.7914931355367709, "faithfulness": 0.7987802242346304, "hallucination_detected": 0, "source_attribution_score": 0.8243795429292404, "latency_ms": 358.4161124076011, "tokens_used": 149, "cost_cents": 0.7836666239789596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
|
| 40 |
-
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9294316493258027, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.99, "answer_relevance": 0.8935370280287651, "faithfulness": 0.9657921757626544, "hallucination_detected": 0, "source_attribution_score": 0.9450063312067425, "latency_ms": 327.13170085845616, "tokens_used": 175, "cost_cents": 0.6736960835259596, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
|
| 41 |
-
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Adverse_Events_Listing.pdf", "Safety_Profile_Report.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8691780049883588, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9053724290502517, "bert_score": 0.887461041624208, "answer_relevance": 0.8255338939540914, "faithfulness": 0.9626678867129402, "hallucination_detected": 0, "source_attribution_score": 0.8858444522908131, "latency_ms": 285.00447311390025, "tokens_used": 212, "cost_cents": 0.26516004149862177, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "04338b7b"}
|
| 42 |
-
{"query": "What are the primary side effects of this drug?", "answer": "Based on the clinical data, what are the primary side effects of this drug. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9104627762675658, "retrieval_recall": 0.937976535494375, "rank_position": 1, "rouge_l": 0.5755118375496409, "bert_score": 0.8919959033394592, "answer_relevance": 0.7836202850178633, "faithfulness": 0.953432186460839, "hallucination_detected": 0, "source_attribution_score": 0.7835803057646042, "latency_ms": 393.78508982459505, "tokens_used": 127, "cost_cents": 0.5839725982219669, "timestamp": "2025-12-29T16:33:52.691937", "eval_id": "6751328e"}
|
| 43 |
-
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.7014865049756396, "retrieval_recall": 0.8532205085753696, "rank_position": 1, "rouge_l": 0.7036879055392565, "bert_score": 0.8232887647654229, "answer_relevance": 0.7240432417784443, "faithfulness": 0.8174281679074274, "hallucination_detected": 0, "source_attribution_score": 0.8094309307066749, "latency_ms": 346.74320628259454, "tokens_used": 156, "cost_cents": 0.30470012119609546, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "6817a77e"}
|
| 44 |
-
{"query": "What is the safety profile based on reported adverse events?", "answer": "Based on the clinical data, what is the safety profile based on reported adverse events. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "FDA_Approval_Summary.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.7708525319903022, "retrieval_recall": 0.7140539324008609, "rank_position": 1, "rouge_l": 0.7582038473536197, "bert_score": 0.8719206100765141, "answer_relevance": 0.768747467165288, "faithfulness": 0.7863906811511377, "hallucination_detected": 0, "source_attribution_score": 0.7827059691758022, "latency_ms": 284.30338447510456, "tokens_used": 193, "cost_cents": 0.5194315945804843, "timestamp": "2025-12-29T16:33:52.692906", "eval_id": "70950525"}
|
| 45 |
-
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf", "Pharmacokinetics_Study.pdf", "Clinical_Trial_Protocol.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8863518011536086, "retrieval_recall": 0.9528433531913749, "rank_position": 1, "rouge_l": 0.6924764309368061, "bert_score": 0.9074272676584865, "answer_relevance": 0.8856496644947377, "faithfulness": 0.9643048532855157, "hallucination_detected": 0, "source_attribution_score": 0.9218544026918479, "latency_ms": 387.9856471606976, "tokens_used": 107, "cost_cents": 0.1927569268723833, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "bc0d2943"}
|
| 46 |
-
{"query": "What were the patient demographics in the clinical trial?", "answer": "Based on the clinical data, what were the patient demographics in the clinical trial. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf", "FDA_Approval_Summary.pdf"], "num_retrieved": 3, "retrieval_precision": 0.8950536281566746, "retrieval_recall": 0.9144248160397045, "rank_position": 1, "rouge_l": 0.7618677680298188, "bert_score": 0.8461644035252505, "answer_relevance": 0.9653601861381645, "faithfulness": 0.8755786694922031, "hallucination_detected": 0, "source_attribution_score": 0.8808869584154418, "latency_ms": 353.36305965541663, "tokens_used": 245, "cost_cents": 0.5148915885221008, "timestamp": "2025-12-29T16:33:52.693443", "eval_id": "eeaa869f"}
|
| 47 |
-
{"query": "How does this drug compare to existing treatments?", "answer": "Based on the clinical data, how does this drug compare to existing treatments. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 2, "retrieval_precision": 0.8098059467924409, "retrieval_recall": 0.6023065734388835, "rank_position": 3, "rouge_l": 0.7004028932959154, "bert_score": 0.813015925326988, "answer_relevance": 0.6784644783231156, "faithfulness": 0.7845740350573508, "hallucination_detected": 0, "source_attribution_score": 0.65, "latency_ms": 459.79977076107156, "tokens_used": 117, "cost_cents": 0.27331328918592634, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "2202146d"}
|
| 48 |
-
{"query": "What is the mechanism of action for this treatment?", "answer": "Based on the clinical data, what is the mechanism of action for this treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Clinical_Trial_Protocol.pdf", "FDA_Approval_Summary.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 3, "retrieval_precision": 1.0, "retrieval_recall": 0.7608373996776078, "rank_position": 1, "rouge_l": 0.7774878763927089, "bert_score": 0.8396996698506028, "answer_relevance": 0.8271539804365684, "faithfulness": 0.9337213882950308, "hallucination_detected": 0, "source_attribution_score": 0.8388598969576262, "latency_ms": 213.90842919317265, "tokens_used": 86, "cost_cents": 0.29153943157162554, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "bfa3ef53"}
|
| 49 |
-
{"query": "What biomarkers should be monitored during treatment?", "answer": "Based on the clinical data, what biomarkers should be monitored during treatment. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Pharmacokinetics_Study.pdf", "Adverse_Events_Listing.pdf", "Clinical_Trial_Protocol.pdf", "Safety_Profile_Report.pdf"], "num_retrieved": 4, "retrieval_precision": 0.8346511394851743, "retrieval_recall": 0.8814336106436549, "rank_position": 1, "rouge_l": 0.8430315573988195, "bert_score": 0.8944331459730633, "answer_relevance": 0.7897513415421694, "faithfulness": 0.8580143425540971, "hallucination_detected": 1, "source_attribution_score": 0.7346872187150348, "latency_ms": 361.9668724913192, "tokens_used": 101, "cost_cents": 0.5711407488163474, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "b0d984c7"}
|
| 50 |
-
{"query": "What are the contraindications for this therapy?", "answer": "Based on the clinical data, what are the contraindications for this therapy. This finding is supported by the source documents indicating a positive correlation with treatment outcomes.", "source_docs": ["Safety_Profile_Report.pdf"], "num_retrieved": 1, "retrieval_precision": 0.9009022034719727, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.9105038892045774, "bert_score": 0.8960253555216661, "answer_relevance": 0.8253569744550738, "faithfulness": 0.8368519357763938, "hallucination_detected": 0, "source_attribution_score": 0.7777537972022747, "latency_ms": 300.33605788776373, "tokens_used": 207, "cost_cents": 0.3048499137601775, "timestamp": "2025-12-29T16:33:52.694024", "eval_id": "81f2022b"}
|
|
|
|
| 1 |
+
{"query": "What is the response rate of Drug X?", "answer": "According to the provided context (Chunk 1), Drug X demonstrated a 65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 324.74422454833984, "tokens_used": 21, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:34.998293", "eval_id": "4e10192a"}
|
| 2 |
+
{"query": "What are the side effects of Drug X?", "answer": "The side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.8542079925537, "tokens_used": 42, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.198696", "eval_id": "05359951"}
|
| 3 |
+
{"query": "How does Drug X interact with warfarin?", "answer": "There is information about the interaction between Drug X and warfarin in the context provided. According to the information in Chunk 1, the interaction between Drug X and warfarin is:\n\n- Potential increased bleeding risk\n- Monitor INR closely\n- Baseline INR and weekly monitoring recommended", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 274.7206687927246, "tokens_used": 46, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.474923", "eval_id": "09eb8892"}
|
| 4 |
+
{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 233.20865631103516, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.708131", "eval_id": "297130d7"}
|
| 5 |
+
{"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 150.78186988830566, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:00:35.860418", "eval_id": "f55b494b"}
|
| 6 |
+
{"query": "What is the response rate of Drug X?", "answer": "65% response rate compared to 35% in the placebo group.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 435.7447624206543, "tokens_used": 10, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.660827", "eval_id": "92c25556"}
|
| 7 |
+
{"query": "What are the side effects of Drug X?", "answer": "The most common adverse events reported in the clinical trial for Drug X were:\n\n1. Headache (12% of patients)\n2. Mild gastrointestinal upset (8% of patients)\n3. Dizziness (5% of patients)\n4. Fatigue (4% of patients)\n\nAdditionally, in the case study, the patient experienced:\n\n1. Mild headache (resolved with acetaminophen)\n2. Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 236.0692024230957, "tokens_used": 57, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:27.898918", "eval_id": "1fab9b0a"}
|
| 8 |
+
{"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context (Chunk 1 - Relevance: 66.9%), Drug X has a potential interaction with warfarin: \"2. Warfarin: - Potential increased bleeding risk - Monitor INR closely - Baseline INR and weekly monitoring recommended\"", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 198.96793365478516, "tokens_used": 36, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.098393", "eval_id": "83a684fc"}
|
| 9 |
+
{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 149.9345302581787, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.248327", "eval_id": "a0771f33"}
|
| 10 |
+
{"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.0, "hallucination_detected": true, "source_attribution_score": 0.0, "latency_ms": 179.38470840454102, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:02:28.440212", "eval_id": "35939350"}
|
| 11 |
+
{"query": "What is the response rate of Drug X?", "answer": "The response rate of Drug X is 65%, compared to 35% in the placebo group (p<0.001).", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5333333333333333, "hallucination_detected": true, "source_attribution_score": 0.3125, "latency_ms": 336.91954612731934, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.623166", "eval_id": "3688751b"}
|
| 12 |
+
{"query": "What are the side effects of Drug X?", "answer": "The most common side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, serious adverse events occurred in 2% of patients, including liver enzyme elevation.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.18181818181818182, "hallucination_detected": true, "source_attribution_score": 0.06818181818181818, "latency_ms": 331.4476013183594, "tokens_used": 44, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:21.955619", "eval_id": "586b096d"}
|
| 13 |
+
{"query": "How does Drug X interact with warfarin?", "answer": "According to the provided context, Drug X interacts with warfarin by potentially increasing the risk of bleeding. Monitoring of INR is recommended, including baseline INR and weekly monitoring is recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.48, "hallucination_detected": true, "source_attribution_score": 0.23333333333333334, "latency_ms": 283.388614654541, "tokens_used": 30, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.240178", "eval_id": "a37df1d9"}
|
| 14 |
+
{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dose of Drug X is 500mg twice daily with meals. Dose adjustments may be necessary for patients with renal impairment, in which case a dose reduction to 250mg twice daily is recommended.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.23529411764705882, "latency_ms": 272.6554870605469, "tokens_used": 34, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.512833", "eval_id": "66ecfa00"}
|
| 15 |
+
{"query": "What is the mechanism of action for Drug X?", "answer": "The mechanism of action for Drug X is inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.5, "hallucination_detected": true, "source_attribution_score": 0.22580645161290322, "latency_ms": 165.5900478363037, "tokens_used": 31, "cost_cents": 0.004, "timestamp": "2026-01-02T11:03:22.679422", "eval_id": "4b7e99dd"}
|
| 16 |
+
{"query": "What is the response rate of Drug X?", "answer": "According to the provided context in Chunk 1, the response rate of Drug X was 65%.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.375, "latency_ms": 594.7244167327881, "tokens_used": 16, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.135089", "eval_id": "f5583f32"}
|
| 17 |
+
{"query": "What are the side effects of Drug X?", "answer": "According to the provided context, the side effects of Drug X are:\n\n- Headache (12% of patients)\n- Mild gastrointestinal upset (8% of patients)\n- Dizziness (5% of patients)\n- Fatigue (4% of patients)\n\nAdditionally, from the case study, the patient experienced:\n\n- Mild headache (resolved with acetaminophen)\n- Occasional nausea (resolved with food intake)", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.3888888888888889, "hallucination_detected": true, "source_attribution_score": 0.2, "latency_ms": 265.3524875640869, "tokens_used": 55, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.403565", "eval_id": "07148b4e"}
|
| 18 |
+
{"query": "How does Drug X interact with warfarin?", "answer": "According to the context, Drug X interacts with warfarin as follows: \n\n- There is a potential increased bleeding risk.\n- It is recommended to monitor INR closely.\n- Baseline INR and weekly monitoring are recommended.", "source_docs": ["drug_interactions", "medical_research_1", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.4666666666666667, "hallucination_detected": true, "source_attribution_score": 0.22857142857142856, "latency_ms": 339.0464782714844, "tokens_used": 35, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.742611", "eval_id": "b7c99790"}
|
| 19 |
+
{"query": "What is the recommended dosage of Drug X?", "answer": "The recommended dosage of Drug X is 500mg twice daily with meals.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.6666666666666666, "hallucination_detected": false, "source_attribution_score": 0.3333333333333333, "latency_ms": 176.81241035461426, "tokens_used": 12, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:10.920425", "eval_id": "1993b43f"}
|
| 20 |
+
{"query": "What is the mechanism of action for Drug X?", "answer": "Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells. This inhibition leads to cell cycle arrest and apoptosis of affected cells.", "source_docs": ["medical_research_1", "drug_interactions", "patient_case_study"], "num_retrieved": 3, "retrieval_precision": 0.3333333333333333, "retrieval_recall": 1.0, "rank_position": 1, "rouge_l": 0.95, "bert_score": 0.95, "answer_relevance": 0.85, "faithfulness": 0.46153846153846156, "hallucination_detected": true, "source_attribution_score": 0.2222222222222222, "latency_ms": 389.65582847595215, "tokens_used": 27, "cost_cents": 0.004, "timestamp": "2026-01-02T11:41:11.310612", "eval_id": "e1fdaf31"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/evaluation.html
CHANGED
|
@@ -7,154 +7,280 @@
|
|
| 7 |
<title>RAG Evaluation Dashboard</title>
|
| 8 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
|
| 9 |
<style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
* {
|
| 11 |
margin: 0;
|
| 12 |
padding: 0;
|
| 13 |
box-sizing: border-box;
|
| 14 |
}
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
body {
|
| 17 |
-
font-family:
|
| 18 |
-
background:
|
| 19 |
min-height: 100vh;
|
| 20 |
-
padding:
|
|
|
|
| 21 |
}
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
.container {
|
| 24 |
-
max-width:
|
| 25 |
margin: 0 auto;
|
| 26 |
-
background: white;
|
| 27 |
-
border-radius: 15px;
|
| 28 |
-
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
|
| 29 |
-
overflow: hidden;
|
| 30 |
}
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
text-align: center;
|
|
|
|
| 37 |
}
|
| 38 |
|
| 39 |
-
|
| 40 |
-
font-size: 2.
|
| 41 |
-
|
|
|
|
|
|
|
| 42 |
}
|
| 43 |
|
| 44 |
-
|
| 45 |
-
font-size:
|
| 46 |
-
|
|
|
|
| 47 |
}
|
| 48 |
|
| 49 |
.nav-buttons {
|
| 50 |
display: flex;
|
| 51 |
-
gap: 10px;
|
| 52 |
justify-content: center;
|
| 53 |
-
|
| 54 |
flex-wrap: wrap;
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
.nav-btn {
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
color: white;
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
cursor: pointer;
|
| 64 |
-
font-size: 1em;
|
| 65 |
-
transition: all 0.3s;
|
| 66 |
}
|
| 67 |
|
| 68 |
-
.nav-btn:hover
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
color: #667eea;
|
| 72 |
}
|
| 73 |
|
| 74 |
-
.
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
}
|
| 77 |
|
| 78 |
-
.
|
| 79 |
-
|
| 80 |
}
|
| 81 |
|
| 82 |
-
.
|
| 83 |
-
|
| 84 |
-
animation: fadeIn 0.3s;
|
| 85 |
}
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
}
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
.metrics-grid {
|
| 98 |
display: grid;
|
| 99 |
-
grid-template-columns: repeat(auto-fit, minmax(
|
| 100 |
-
gap:
|
| 101 |
-
margin-bottom:
|
| 102 |
}
|
| 103 |
|
| 104 |
.metric-card {
|
| 105 |
-
background:
|
| 106 |
color: white;
|
| 107 |
-
padding:
|
| 108 |
-
border-radius:
|
| 109 |
text-align: center;
|
| 110 |
-
|
| 111 |
-
transition: transform 0.3s;
|
| 112 |
}
|
| 113 |
|
| 114 |
.metric-card:hover {
|
| 115 |
-
transform: translateY(-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
}
|
| 117 |
|
| 118 |
.metric-label {
|
| 119 |
-
font-size: 0.
|
| 120 |
opacity: 0.9;
|
| 121 |
-
margin-bottom:
|
| 122 |
text-transform: uppercase;
|
| 123 |
-
letter-spacing:
|
| 124 |
}
|
| 125 |
|
| 126 |
.metric-value {
|
| 127 |
-
font-size:
|
| 128 |
-
font-weight:
|
| 129 |
}
|
| 130 |
|
| 131 |
.metric-unit {
|
| 132 |
-
font-size: 0.
|
| 133 |
opacity: 0.8;
|
| 134 |
-
margin-top:
|
| 135 |
}
|
| 136 |
|
|
|
|
|
|
|
|
|
|
| 137 |
.chart-container {
|
| 138 |
-
background:
|
| 139 |
-
border: 1px solid
|
| 140 |
-
border-radius:
|
| 141 |
-
padding:
|
| 142 |
-
margin-bottom:
|
| 143 |
-
min-height:
|
| 144 |
}
|
| 145 |
|
| 146 |
.chart-title {
|
| 147 |
-
font-size:
|
| 148 |
font-weight: 600;
|
| 149 |
-
margin-bottom:
|
| 150 |
-
color:
|
| 151 |
}
|
| 152 |
|
| 153 |
.two-column {
|
| 154 |
display: grid;
|
| 155 |
grid-template-columns: 1fr 1fr;
|
| 156 |
gap: 20px;
|
| 157 |
-
margin-bottom: 20px;
|
| 158 |
}
|
| 159 |
|
| 160 |
@media (max-width: 900px) {
|
|
@@ -163,31 +289,49 @@
|
|
| 163 |
}
|
| 164 |
}
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
| 168 |
}
|
| 169 |
|
| 170 |
-
.
|
| 171 |
-
|
|
|
|
| 172 |
}
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
|
|
|
|
|
|
|
|
|
|
| 178 |
.info-box {
|
| 179 |
-
background:
|
| 180 |
-
border-left: 4px solid
|
| 181 |
-
padding:
|
|
|
|
| 182 |
margin-bottom: 20px;
|
| 183 |
-
border-radius: 5px;
|
| 184 |
}
|
| 185 |
|
| 186 |
.info-box p {
|
| 187 |
-
|
| 188 |
-
|
| 189 |
}
|
| 190 |
|
|
|
|
|
|
|
|
|
|
| 191 |
.button-group {
|
| 192 |
display: flex;
|
| 193 |
gap: 10px;
|
|
@@ -196,43 +340,49 @@
|
|
| 196 |
}
|
| 197 |
|
| 198 |
.btn {
|
| 199 |
-
padding: 10px
|
| 200 |
-
background:
|
| 201 |
color: white;
|
| 202 |
border: none;
|
| 203 |
-
border-radius:
|
| 204 |
cursor: pointer;
|
| 205 |
-
font-size:
|
| 206 |
-
|
|
|
|
| 207 |
}
|
| 208 |
|
| 209 |
.btn:hover {
|
| 210 |
-
background: #
|
| 211 |
}
|
| 212 |
|
| 213 |
.btn-secondary {
|
| 214 |
-
background:
|
|
|
|
|
|
|
| 215 |
}
|
| 216 |
|
| 217 |
.btn-secondary:hover {
|
| 218 |
-
background:
|
| 219 |
}
|
| 220 |
|
|
|
|
|
|
|
|
|
|
| 221 |
.loading {
|
| 222 |
display: none;
|
| 223 |
text-align: center;
|
| 224 |
-
padding:
|
| 225 |
-
color:
|
| 226 |
}
|
| 227 |
|
| 228 |
.spinner {
|
| 229 |
-
border: 4px solid
|
| 230 |
-
border-top: 4px solid
|
| 231 |
border-radius: 50%;
|
| 232 |
width: 40px;
|
| 233 |
height: 40px;
|
| 234 |
animation: spin 1s linear infinite;
|
| 235 |
-
margin: 0 auto
|
| 236 |
}
|
| 237 |
|
| 238 |
@keyframes spin {
|
|
@@ -245,272 +395,287 @@
|
|
| 245 |
}
|
| 246 |
}
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
}
|
| 252 |
-
|
| 253 |
.failure-item {
|
| 254 |
-
background:
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
border-
|
| 259 |
}
|
| 260 |
|
| 261 |
.failure-item-query {
|
| 262 |
-
font-
|
| 263 |
-
color:
|
|
|
|
| 264 |
}
|
| 265 |
|
| 266 |
.failure-item-score {
|
| 267 |
-
font-size: 0.
|
| 268 |
-
color:
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
}
|
| 271 |
</style>
|
| 272 |
</head>
|
| 273 |
|
| 274 |
<body>
|
|
|
|
|
|
|
| 275 |
<div class="container">
|
| 276 |
-
<
|
| 277 |
-
<h1
|
| 278 |
-
<p>
|
| 279 |
<div class="nav-buttons">
|
| 280 |
-
<
|
| 281 |
-
<
|
| 282 |
-
<button class="nav-btn" onclick="showSection('generation')">Generation</button>
|
| 283 |
-
<button class="nav-btn" onclick="showSection('faithfulness')">Faithfulness</button>
|
| 284 |
-
<button class="nav-btn" onclick="showSection('performance')">Performance</button>
|
| 285 |
-
<button class="nav-btn" onclick="showSection('failures')">Failures</button>
|
| 286 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
</div>
|
| 288 |
|
| 289 |
-
<div class="
|
| 290 |
-
|
| 291 |
-
<
|
| 292 |
-
|
| 293 |
-
<button class="btn" onclick="loadMetrics()">π Refresh Metrics</button>
|
| 294 |
-
<button class="btn btn-secondary" onclick="exportResults()">π₯ Export Results</button>
|
| 295 |
-
<button class="btn btn-secondary" onclick="clearResults()">ποΈ Clear Results</button>
|
| 296 |
-
</div>
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
| 301 |
</div>
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
</div>
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
<
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
</div>
|
| 312 |
</div>
|
| 313 |
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
<
|
| 317 |
-
|
| 318 |
-
<div class="two-column">
|
| 319 |
-
<div class="chart-container">
|
| 320 |
-
<div class="chart-title">Precision & Recall Trend</div>
|
| 321 |
-
<div id="retrievalChart"></div>
|
| 322 |
-
</div>
|
| 323 |
-
<div class="chart-container">
|
| 324 |
-
<div class="chart-title">Key Metrics</div>
|
| 325 |
-
<div style="padding: 20px;">
|
| 326 |
-
<p><strong>Mean Reciprocal Rank (MRR):</strong> <span id="mrrValue">--</span></p>
|
| 327 |
-
<p style="margin-top: 10px; font-size: 0.9em;">Measures ranking quality of retrieved
|
| 328 |
-
documents. Higher is better (ideal: 1.0)</p>
|
| 329 |
-
<hr style="margin: 15px 0;">
|
| 330 |
-
<p><strong>Avg Precision:</strong> <span id="avgPrecision">--</span></p>
|
| 331 |
-
<p style="margin-top: 10px;"><strong>Avg Recall:</strong> <span id="avgRecall">--</span></p>
|
| 332 |
-
</div>
|
| 333 |
-
</div>
|
| 334 |
-
</div>
|
| 335 |
</div>
|
| 336 |
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
</div>
|
|
|
|
| 350 |
</div>
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
<
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
| 356 |
</div>
|
| 357 |
</div>
|
|
|
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
<div class="chart-container">
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
</
|
| 368 |
-
<div class="chart-container">
|
| 369 |
-
<div class="chart-title">Faithfulness Trend</div>
|
| 370 |
-
<div id="faithfulnessChart"></div>
|
| 371 |
-
</div>
|
| 372 |
</div>
|
| 373 |
</div>
|
|
|
|
| 374 |
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
<div class="chart-container">
|
| 381 |
-
<div class="chart-title">Latency vs Cost</div>
|
| 382 |
-
<div id="latencyChart"></div>
|
| 383 |
-
</div>
|
| 384 |
-
<div class="chart-container">
|
| 385 |
-
<div class="chart-title">Latency Percentiles</div>
|
| 386 |
-
<div id="percentileChart"></div>
|
| 387 |
-
</div>
|
| 388 |
</div>
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
<div class="
|
| 392 |
-
<div class="metric-label">P50 Latency</div>
|
| 393 |
-
<div class="metric-value" id="p50Value">--</div>
|
| 394 |
-
<div class="metric-unit">milliseconds</div>
|
| 395 |
-
</div>
|
| 396 |
-
<div class="metric-card">
|
| 397 |
-
<div class="metric-label">P95 Latency</div>
|
| 398 |
-
<div class="metric-value" id="p95Value">--</div>
|
| 399 |
-
<div class="metric-unit">milliseconds</div>
|
| 400 |
-
</div>
|
| 401 |
-
<div class="metric-card">
|
| 402 |
-
<div class="metric-label">P99 Latency</div>
|
| 403 |
-
<div class="metric-value" id="p99Value">--</div>
|
| 404 |
-
<div class="metric-unit">milliseconds</div>
|
| 405 |
-
</div>
|
| 406 |
-
<div class="metric-card">
|
| 407 |
-
<div class="metric-label">Avg Cost</div>
|
| 408 |
-
<div class="metric-value" id="costValue">--</div>
|
| 409 |
-
<div class="metric-unit">cents per query</div>
|
| 410 |
-
</div>
|
| 411 |
</div>
|
| 412 |
</div>
|
|
|
|
| 413 |
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
</
|
| 423 |
-
<div class="chart-container">
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
<
|
| 430 |
-
<
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
</div>
|
| 433 |
</div>
|
| 434 |
</div>
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
|
|
|
| 439 |
</div>
|
| 440 |
</div>
|
| 441 |
</div>
|
| 442 |
</div>
|
| 443 |
|
| 444 |
<script>
|
|
|
|
| 445 |
let metricsData = null;
|
| 446 |
let timeseriesData = null;
|
| 447 |
let failureData = null;
|
| 448 |
|
|
|
|
| 449 |
function showSection(sectionId) {
|
| 450 |
-
// Hide all sections
|
| 451 |
document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
|
| 452 |
-
document.querySelectorAll('.
|
| 453 |
|
| 454 |
-
// Show selected section
|
| 455 |
document.getElementById(sectionId).classList.add('active');
|
| 456 |
event.target.classList.add('active');
|
| 457 |
|
| 458 |
-
// Load data
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
|
|
|
| 465 |
}
|
| 466 |
|
| 467 |
async function loadMetrics() {
|
| 468 |
showLoading(true);
|
| 469 |
try {
|
| 470 |
-
const response = await fetch(
|
| 471 |
metricsData = await response.json();
|
| 472 |
|
| 473 |
-
// Update overview
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
<div class="metric-value">${metricsData.retrieval_precision_mean.toFixed(3)}</div>
|
| 483 |
-
</div>
|
| 484 |
-
<div class="metric-card">
|
| 485 |
-
<div class="metric-label">Avg BERTScore</div>
|
| 486 |
-
<div class="metric-value">${metricsData.bert_score_mean.toFixed(3)}</div>
|
| 487 |
-
</div>
|
| 488 |
-
<div class="metric-card">
|
| 489 |
-
<div class="metric-label">Faithfulness</div>
|
| 490 |
-
<div class="metric-value">${metricsData.faithfulness_mean.toFixed(3)}</div>
|
| 491 |
-
</div>
|
| 492 |
-
<div class="metric-card">
|
| 493 |
-
<div class="metric-label">Hallucination Rate</div>
|
| 494 |
-
<div class="metric-value">${(metricsData.hallucination_rate * 100).toFixed(1)}%</div>
|
| 495 |
-
</div>
|
| 496 |
-
<div class="metric-card">
|
| 497 |
-
<div class="metric-label">Avg Latency</div>
|
| 498 |
-
<div class="metric-value">${metricsData.latency_mean.toFixed(0)}</div>
|
| 499 |
-
<div class="metric-unit">ms</div>
|
| 500 |
-
</div>
|
| 501 |
-
<div class="metric-card">
|
| 502 |
-
<div class="metric-label">MRR</div>
|
| 503 |
-
<div class="metric-value">${metricsData.mrr.toFixed(3)}</div>
|
| 504 |
-
</div>
|
| 505 |
-
<div class="metric-card">
|
| 506 |
-
<div class="metric-label">Cost/Query</div>
|
| 507 |
-
<div class="metric-value">$${(metricsData.cost_per_query / 100).toFixed(4)}</div>
|
| 508 |
-
</div>
|
| 509 |
-
`;
|
| 510 |
-
|
| 511 |
-
document.getElementById('totalEvals').textContent = metricsData.total_evaluations;
|
| 512 |
-
document.getElementById('lastUpdated').textContent = new Date(metricsData.timestamp).toLocaleString();
|
| 513 |
-
document.getElementById('systemStatus').textContent = metricsData.hallucination_rate < 0.15 ? 'β
Healthy' : 'β οΈ Issues Detected';
|
| 514 |
|
| 515 |
} catch (e) {
|
| 516 |
console.error('Error loading metrics:', e);
|
|
@@ -520,11 +685,13 @@
|
|
| 520 |
|
| 521 |
async function loadRetrievalCharts() {
|
| 522 |
try {
|
| 523 |
-
|
| 524 |
-
|
|
|
|
|
|
|
| 525 |
|
| 526 |
if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
|
| 527 |
-
document.getElementById('retrievalChart').innerHTML = '<p style="padding:
|
| 528 |
return;
|
| 529 |
}
|
| 530 |
|
|
@@ -533,7 +700,7 @@
|
|
| 533 |
y: timeseriesData.retrieval_precision,
|
| 534 |
name: 'Precision',
|
| 535 |
mode: 'lines+markers',
|
| 536 |
-
line: { color: '#
|
| 537 |
};
|
| 538 |
|
| 539 |
const trace2 = {
|
|
@@ -541,21 +708,25 @@
|
|
| 541 |
y: timeseriesData.retrieval_recall,
|
| 542 |
name: 'Recall',
|
| 543 |
mode: 'lines+markers',
|
| 544 |
-
line: { color: '#
|
| 545 |
};
|
| 546 |
|
| 547 |
-
|
| 548 |
-
title: '',
|
| 549 |
xaxis: { title: 'Query Index' },
|
| 550 |
yaxis: { title: 'Score' },
|
| 551 |
hovermode: 'x unified',
|
| 552 |
-
responsive: true
|
| 553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
|
| 555 |
if (metricsData) {
|
| 556 |
-
document.getElementById('mrrValue').textContent = metricsData.mrr
|
| 557 |
-
document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean
|
| 558 |
-
document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean
|
| 559 |
}
|
| 560 |
|
| 561 |
} catch (e) {
|
|
@@ -566,7 +737,7 @@
|
|
| 566 |
async function loadGenerationCharts() {
|
| 567 |
try {
|
| 568 |
if (!timeseriesData) {
|
| 569 |
-
const response = await fetch(
|
| 570 |
timeseriesData = await response.json();
|
| 571 |
}
|
| 572 |
|
|
@@ -577,7 +748,7 @@
|
|
| 577 |
y: timeseriesData.rouge_l,
|
| 578 |
name: 'ROUGE-L',
|
| 579 |
mode: 'lines+markers',
|
| 580 |
-
line: { color: '#
|
| 581 |
};
|
| 582 |
|
| 583 |
const trace2 = {
|
|
@@ -585,23 +756,34 @@
|
|
| 585 |
y: timeseriesData.bert_score,
|
| 586 |
name: 'BERTScore',
|
| 587 |
mode: 'lines+markers',
|
| 588 |
-
line: { color: '#
|
| 589 |
};
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
|
| 595 |
if (metricsData) {
|
| 596 |
const barsTrace = {
|
| 597 |
x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
|
| 598 |
y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
|
| 599 |
type: 'bar',
|
| 600 |
-
marker: { color: ['#
|
| 601 |
};
|
| 602 |
|
| 603 |
Plotly.newPlot('generationBars', [barsTrace], {
|
| 604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
});
|
| 606 |
}
|
| 607 |
|
|
@@ -614,21 +796,24 @@
|
|
| 614 |
try {
|
| 615 |
if (!metricsData) await loadMetrics();
|
| 616 |
if (!timeseriesData) {
|
| 617 |
-
const response = await fetch(
|
| 618 |
timeseriesData = await response.json();
|
| 619 |
}
|
| 620 |
|
| 621 |
-
const hallRate = metricsData.hallucination_rate;
|
| 622 |
const faithfulRate = 1 - hallRate;
|
| 623 |
|
| 624 |
const pieTrace = {
|
| 625 |
labels: ['Faithful Answers', 'Hallucinations'],
|
| 626 |
values: [faithfulRate * 100, hallRate * 100],
|
| 627 |
type: 'pie',
|
| 628 |
-
marker: { colors: ['#
|
| 629 |
};
|
| 630 |
|
| 631 |
-
Plotly.newPlot('hallucinationChart', [pieTrace], {
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
|
| 634 |
const faithTrace = {
|
|
@@ -636,12 +821,16 @@
|
|
| 636 |
y: timeseriesData.faithfulness,
|
| 637 |
name: 'Faithfulness',
|
| 638 |
mode: 'lines+markers',
|
| 639 |
-
line: { color: '#
|
| 640 |
marker: { size: 6 }
|
| 641 |
};
|
| 642 |
|
| 643 |
Plotly.newPlot('faithfulnessChart', [faithTrace], {
|
| 644 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
});
|
| 646 |
}
|
| 647 |
|
|
@@ -654,28 +843,32 @@
|
|
| 654 |
try {
|
| 655 |
if (!metricsData) await loadMetrics();
|
| 656 |
if (!timeseriesData) {
|
| 657 |
-
const response = await fetch(
|
| 658 |
timeseriesData = await response.json();
|
| 659 |
}
|
| 660 |
|
| 661 |
if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
|
| 662 |
const latencyTrace = {
|
| 663 |
-
x: timeseriesData.
|
| 664 |
y: timeseriesData.latency_ms,
|
| 665 |
-
mode: 'markers',
|
| 666 |
-
|
| 667 |
-
|
| 668 |
};
|
| 669 |
|
| 670 |
Plotly.newPlot('latencyChart', [latencyTrace], {
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
});
|
| 673 |
}
|
| 674 |
|
| 675 |
-
document.getElementById('p50Value').textContent = metricsData.latency_p50
|
| 676 |
-
document.getElementById('p95Value').textContent = metricsData.latency_p95
|
| 677 |
-
document.getElementById('p99Value').textContent = metricsData.latency_p99
|
| 678 |
-
document.getElementById('costValue').textContent = (metricsData.cost_per_query / 100).toFixed(4);
|
| 679 |
|
| 680 |
} catch (e) {
|
| 681 |
console.error('Error loading performance charts:', e);
|
|
@@ -684,25 +877,28 @@
|
|
| 684 |
|
| 685 |
async function loadFailureCharts() {
|
| 686 |
try {
|
| 687 |
-
const response = await fetch(
|
| 688 |
failureData = await response.json();
|
| 689 |
|
| 690 |
const failureChart = {
|
| 691 |
x: Object.keys(failureData.failure_modes),
|
| 692 |
y: Object.values(failureData.failure_modes),
|
| 693 |
type: 'bar',
|
| 694 |
-
marker: { color: '#
|
| 695 |
};
|
| 696 |
|
| 697 |
Plotly.newPlot('failureChart', [failureChart], {
|
| 698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
});
|
| 700 |
|
| 701 |
document.getElementById('totalFailures').textContent = failureData.total_failures;
|
| 702 |
document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
|
| 703 |
document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
|
| 704 |
document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
|
| 705 |
-
document.getElementById('faithCount').textContent = failureData.failure_modes.low_faithfulness;
|
| 706 |
|
| 707 |
// Show recent failures
|
| 708 |
const failureList = document.getElementById('failureList');
|
|
@@ -713,10 +909,13 @@
|
|
| 713 |
];
|
| 714 |
|
| 715 |
allFailures.forEach(f => {
|
| 716 |
-
html += `<div class="failure-item"
|
|
|
|
|
|
|
|
|
|
| 717 |
});
|
| 718 |
|
| 719 |
-
failureList.innerHTML = html || '<p style="padding: 20px; color:
|
| 720 |
|
| 721 |
} catch (e) {
|
| 722 |
console.error('Error loading failure analysis:', e);
|
|
@@ -729,7 +928,7 @@
|
|
| 729 |
|
| 730 |
async function exportResults() {
|
| 731 |
try {
|
| 732 |
-
const response = await fetch(
|
| 733 |
const blob = await response.blob();
|
| 734 |
const url = window.URL.createObjectURL(blob);
|
| 735 |
const a = document.createElement('a');
|
|
@@ -745,7 +944,7 @@
|
|
| 745 |
async function clearResults() {
|
| 746 |
if (confirm('Are you sure you want to clear all results?')) {
|
| 747 |
try {
|
| 748 |
-
await fetch(
|
| 749 |
metricsData = null;
|
| 750 |
timeseriesData = null;
|
| 751 |
failureData = null;
|
|
@@ -757,6 +956,33 @@
|
|
| 757 |
}
|
| 758 |
}
|
| 759 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
// Load metrics on page load
|
| 761 |
window.addEventListener('load', loadMetrics);
|
| 762 |
</script>
|
|
|
|
| 7 |
<title>RAG Evaluation Dashboard</title>
|
| 8 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.26.0/plotly.min.js"></script>
|
| 9 |
<style>
|
| 10 |
+
/* -----------------------------
|
| 11 |
+
Design tokens (matching index.html)
|
| 12 |
+
------------------------------*/
|
| 13 |
+
:root {
|
| 14 |
+
--surface: #ffffff;
|
| 15 |
+
--surface-subtle: #fafafa;
|
| 16 |
+
--bg-main: #f5f7fb;
|
| 17 |
+
--card-bg: #ffffff;
|
| 18 |
+
--accent: #2563eb;
|
| 19 |
+
--accent-soft: #eff6ff;
|
| 20 |
+
--text-main: #111827;
|
| 21 |
+
--text-muted: #6b7280;
|
| 22 |
+
--border-soft: #e5e7eb;
|
| 23 |
+
--success: #16a34a;
|
| 24 |
+
--error: #dc2626;
|
| 25 |
+
--warning: #f59e0b;
|
| 26 |
+
--info: #2563eb;
|
| 27 |
+
--radius-sm: 6px;
|
| 28 |
+
--radius-md: 10px;
|
| 29 |
+
--radius-lg: 14px;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
/* Dark mode */
|
| 33 |
+
[data-theme="dark"] {
|
| 34 |
+
--surface: #1f2937;
|
| 35 |
+
--surface-subtle: #111827;
|
| 36 |
+
--bg-main: #0f172a;
|
| 37 |
+
--card-bg: #1e293b;
|
| 38 |
+
--accent: #60a5fa;
|
| 39 |
+
--accent-soft: #1e3a5f;
|
| 40 |
+
--text-main: #f1f5f9;
|
| 41 |
+
--text-muted: #94a3b8;
|
| 42 |
+
--border-soft: #334155;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
/* -----------------------------
|
| 46 |
+
Reset
|
| 47 |
+
------------------------------*/
|
| 48 |
* {
|
| 49 |
margin: 0;
|
| 50 |
padding: 0;
|
| 51 |
box-sizing: border-box;
|
| 52 |
}
|
| 53 |
|
| 54 |
+
/* -----------------------------
|
| 55 |
+
Base
|
| 56 |
+
------------------------------*/
|
| 57 |
body {
|
| 58 |
+
font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
| 59 |
+
background: var(--bg-main);
|
| 60 |
min-height: 100vh;
|
| 61 |
+
padding: 24px;
|
| 62 |
+
color: var(--text-main);
|
| 63 |
}
|
| 64 |
|
| 65 |
+
/* -----------------------------
|
| 66 |
+
Container
|
| 67 |
+
------------------------------*/
|
| 68 |
.container {
|
| 69 |
+
max-width: 1200px;
|
| 70 |
margin: 0 auto;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
}
|
| 72 |
|
| 73 |
+
/* -----------------------------
|
| 74 |
+
Header
|
| 75 |
+
------------------------------*/
|
| 76 |
+
header {
|
| 77 |
text-align: center;
|
| 78 |
+
margin-bottom: 36px;
|
| 79 |
}
|
| 80 |
|
| 81 |
+
header h1 {
|
| 82 |
+
font-size: 2.2rem;
|
| 83 |
+
font-weight: 600;
|
| 84 |
+
letter-spacing: -0.02em;
|
| 85 |
+
margin-bottom: 8px;
|
| 86 |
}
|
| 87 |
|
| 88 |
+
header p {
|
| 89 |
+
font-size: 1rem;
|
| 90 |
+
color: var(--text-muted);
|
| 91 |
+
margin-bottom: 20px;
|
| 92 |
}
|
| 93 |
|
| 94 |
.nav-buttons {
|
| 95 |
display: flex;
|
|
|
|
| 96 |
justify-content: center;
|
| 97 |
+
gap: 12px;
|
| 98 |
flex-wrap: wrap;
|
| 99 |
+
margin-bottom: 12px;
|
| 100 |
}
|
| 101 |
|
| 102 |
.nav-btn {
|
| 103 |
+
display: inline-flex;
|
| 104 |
+
align-items: center;
|
| 105 |
+
gap: 8px;
|
| 106 |
+
padding: 10px 18px;
|
| 107 |
+
background: var(--accent);
|
| 108 |
color: white;
|
| 109 |
+
text-decoration: none;
|
| 110 |
+
border-radius: var(--radius-md);
|
| 111 |
+
font-size: 0.9rem;
|
| 112 |
+
font-weight: 500;
|
| 113 |
+
transition: background 0.15s ease, transform 0.15s ease;
|
| 114 |
+
border: none;
|
| 115 |
cursor: pointer;
|
|
|
|
|
|
|
| 116 |
}
|
| 117 |
|
| 118 |
+
.nav-btn:hover {
|
| 119 |
+
background: #1d4ed8;
|
| 120 |
+
transform: translateY(-1px);
|
|
|
|
| 121 |
}
|
| 122 |
|
| 123 |
+
.nav-btn.secondary {
|
| 124 |
+
background: var(--surface);
|
| 125 |
+
color: var(--text-main);
|
| 126 |
+
border: 1px solid var(--border-soft);
|
| 127 |
}
|
| 128 |
|
| 129 |
+
.nav-btn.secondary:hover {
|
| 130 |
+
background: var(--surface-subtle);
|
| 131 |
}
|
| 132 |
|
| 133 |
+
[data-theme="dark"] .nav-btn {
|
| 134 |
+
background: #60a5fa;
|
|
|
|
| 135 |
}
|
| 136 |
|
| 137 |
+
[data-theme="dark"] .nav-btn:hover {
|
| 138 |
+
background: #3b82f6;
|
| 139 |
+
}
|
|
|
|
| 140 |
|
| 141 |
+
/* Theme toggle */
|
| 142 |
+
.theme-toggle {
|
| 143 |
+
position: fixed;
|
| 144 |
+
top: 20px;
|
| 145 |
+
right: 20px;
|
| 146 |
+
padding: 8px 14px;
|
| 147 |
+
background: var(--surface);
|
| 148 |
+
border: 1px solid var(--border-soft);
|
| 149 |
+
border-radius: var(--radius-md);
|
| 150 |
+
cursor: pointer;
|
| 151 |
+
font-size: 0.85rem;
|
| 152 |
+
color: var(--text-main);
|
| 153 |
+
z-index: 100;
|
| 154 |
}
|
| 155 |
|
| 156 |
+
/* -----------------------------
|
| 157 |
+
Tab Navigation
|
| 158 |
+
------------------------------*/
|
| 159 |
+
.tab-nav {
|
| 160 |
+
display: flex;
|
| 161 |
+
justify-content: center;
|
| 162 |
+
gap: 8px;
|
| 163 |
+
margin-bottom: 24px;
|
| 164 |
+
flex-wrap: wrap;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
.tab-btn {
|
| 168 |
+
padding: 10px 20px;
|
| 169 |
+
background: var(--surface);
|
| 170 |
+
border: 1px solid var(--border-soft);
|
| 171 |
+
color: var(--text-main);
|
| 172 |
+
border-radius: var(--radius-md);
|
| 173 |
+
cursor: pointer;
|
| 174 |
+
font-size: 0.9rem;
|
| 175 |
+
transition: all 0.15s ease;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
.tab-btn:hover {
|
| 179 |
+
background: var(--surface-subtle);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.tab-btn.active {
|
| 183 |
+
background: var(--accent);
|
| 184 |
+
color: white;
|
| 185 |
+
border-color: var(--accent);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
/* -----------------------------
|
| 189 |
+
Cards
|
| 190 |
+
------------------------------*/
|
| 191 |
+
.card {
|
| 192 |
+
background: var(--surface);
|
| 193 |
+
border-radius: var(--radius-lg);
|
| 194 |
+
padding: 24px;
|
| 195 |
+
border: 1px solid var(--border-soft);
|
| 196 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
|
| 197 |
+
margin-bottom: 24px;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.card h2 {
|
| 201 |
+
font-size: 1.1rem;
|
| 202 |
+
font-weight: 600;
|
| 203 |
+
margin-bottom: 16px;
|
| 204 |
+
color: var(--text-main);
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
/* -----------------------------
|
| 208 |
+
Metrics Grid
|
| 209 |
+
------------------------------*/
|
| 210 |
.metrics-grid {
|
| 211 |
display: grid;
|
| 212 |
+
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
| 213 |
+
gap: 16px;
|
| 214 |
+
margin-bottom: 24px;
|
| 215 |
}
|
| 216 |
|
| 217 |
.metric-card {
|
| 218 |
+
background: var(--accent);
|
| 219 |
color: white;
|
| 220 |
+
padding: 20px;
|
| 221 |
+
border-radius: var(--radius-md);
|
| 222 |
text-align: center;
|
| 223 |
+
transition: transform 0.15s ease;
|
|
|
|
| 224 |
}
|
| 225 |
|
| 226 |
.metric-card:hover {
|
| 227 |
+
transform: translateY(-2px);
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.metric-card.success {
|
| 231 |
+
background: var(--success);
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.metric-card.warning {
|
| 235 |
+
background: var(--warning);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
.metric-card.error {
|
| 239 |
+
background: var(--error);
|
| 240 |
}
|
| 241 |
|
| 242 |
.metric-label {
|
| 243 |
+
font-size: 0.8rem;
|
| 244 |
opacity: 0.9;
|
| 245 |
+
margin-bottom: 8px;
|
| 246 |
text-transform: uppercase;
|
| 247 |
+
letter-spacing: 0.5px;
|
| 248 |
}
|
| 249 |
|
| 250 |
.metric-value {
|
| 251 |
+
font-size: 1.8rem;
|
| 252 |
+
font-weight: 700;
|
| 253 |
}
|
| 254 |
|
| 255 |
.metric-unit {
|
| 256 |
+
font-size: 0.75rem;
|
| 257 |
opacity: 0.8;
|
| 258 |
+
margin-top: 4px;
|
| 259 |
}
|
| 260 |
|
| 261 |
+
/* -----------------------------
|
| 262 |
+
Charts
|
| 263 |
+
------------------------------*/
|
| 264 |
.chart-container {
|
| 265 |
+
background: var(--surface);
|
| 266 |
+
border: 1px solid var(--border-soft);
|
| 267 |
+
border-radius: var(--radius-md);
|
| 268 |
+
padding: 16px;
|
| 269 |
+
margin-bottom: 20px;
|
| 270 |
+
min-height: 350px;
|
| 271 |
}
|
| 272 |
|
| 273 |
.chart-title {
|
| 274 |
+
font-size: 1rem;
|
| 275 |
font-weight: 600;
|
| 276 |
+
margin-bottom: 12px;
|
| 277 |
+
color: var(--text-main);
|
| 278 |
}
|
| 279 |
|
| 280 |
.two-column {
|
| 281 |
display: grid;
|
| 282 |
grid-template-columns: 1fr 1fr;
|
| 283 |
gap: 20px;
|
|
|
|
| 284 |
}
|
| 285 |
|
| 286 |
@media (max-width: 900px) {
|
|
|
|
| 289 |
}
|
| 290 |
}
|
| 291 |
|
| 292 |
+
/* -----------------------------
|
| 293 |
+
Sections
|
| 294 |
+
------------------------------*/
|
| 295 |
+
.section {
|
| 296 |
+
display: none;
|
| 297 |
}
|
| 298 |
|
| 299 |
+
.section.active {
|
| 300 |
+
display: block;
|
| 301 |
+
animation: fadeIn 0.3s ease;
|
| 302 |
}
|
| 303 |
|
| 304 |
+
@keyframes fadeIn {
|
| 305 |
+
from {
|
| 306 |
+
opacity: 0;
|
| 307 |
+
transform: translateY(10px);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
to {
|
| 311 |
+
opacity: 1;
|
| 312 |
+
transform: translateY(0);
|
| 313 |
+
}
|
| 314 |
}
|
| 315 |
|
| 316 |
+
/* -----------------------------
|
| 317 |
+
Info Box
|
| 318 |
+
------------------------------*/
|
| 319 |
.info-box {
|
| 320 |
+
background: var(--accent-soft);
|
| 321 |
+
border-left: 4px solid var(--accent);
|
| 322 |
+
padding: 14px;
|
| 323 |
+
border-radius: var(--radius-sm);
|
| 324 |
margin-bottom: 20px;
|
|
|
|
| 325 |
}
|
| 326 |
|
| 327 |
.info-box p {
|
| 328 |
+
font-size: 0.9rem;
|
| 329 |
+
color: var(--text-muted);
|
| 330 |
}
|
| 331 |
|
| 332 |
+
/* -----------------------------
|
| 333 |
+
Button Group
|
| 334 |
+
------------------------------*/
|
| 335 |
.button-group {
|
| 336 |
display: flex;
|
| 337 |
gap: 10px;
|
|
|
|
| 340 |
}
|
| 341 |
|
| 342 |
.btn {
|
| 343 |
+
padding: 10px 18px;
|
| 344 |
+
background: var(--accent);
|
| 345 |
color: white;
|
| 346 |
border: none;
|
| 347 |
+
border-radius: var(--radius-md);
|
| 348 |
cursor: pointer;
|
| 349 |
+
font-size: 0.9rem;
|
| 350 |
+
font-weight: 500;
|
| 351 |
+
transition: background 0.15s ease;
|
| 352 |
}
|
| 353 |
|
| 354 |
.btn:hover {
|
| 355 |
+
background: #1d4ed8;
|
| 356 |
}
|
| 357 |
|
| 358 |
.btn-secondary {
|
| 359 |
+
background: var(--surface);
|
| 360 |
+
color: var(--text-main);
|
| 361 |
+
border: 1px solid var(--border-soft);
|
| 362 |
}
|
| 363 |
|
| 364 |
.btn-secondary:hover {
|
| 365 |
+
background: var(--surface-subtle);
|
| 366 |
}
|
| 367 |
|
| 368 |
+
/* -----------------------------
|
| 369 |
+
Loading
|
| 370 |
+
------------------------------*/
|
| 371 |
.loading {
|
| 372 |
display: none;
|
| 373 |
text-align: center;
|
| 374 |
+
padding: 40px;
|
| 375 |
+
color: var(--accent);
|
| 376 |
}
|
| 377 |
|
| 378 |
.spinner {
|
| 379 |
+
border: 4px solid var(--border-soft);
|
| 380 |
+
border-top: 4px solid var(--accent);
|
| 381 |
border-radius: 50%;
|
| 382 |
width: 40px;
|
| 383 |
height: 40px;
|
| 384 |
animation: spin 1s linear infinite;
|
| 385 |
+
margin: 0 auto 15px;
|
| 386 |
}
|
| 387 |
|
| 388 |
@keyframes spin {
|
|
|
|
| 395 |
}
|
| 396 |
}
|
| 397 |
|
| 398 |
+
/* -----------------------------
|
| 399 |
+
Failure Items
|
| 400 |
+
------------------------------*/
|
|
|
|
|
|
|
| 401 |
.failure-item {
|
| 402 |
+
background: var(--surface-subtle);
|
| 403 |
+
border-left: 4px solid var(--error);
|
| 404 |
+
padding: 12px;
|
| 405 |
+
margin-bottom: 10px;
|
| 406 |
+
border-radius: var(--radius-sm);
|
| 407 |
}
|
| 408 |
|
| 409 |
.failure-item-query {
|
| 410 |
+
font-size: 0.9rem;
|
| 411 |
+
color: var(--text-main);
|
| 412 |
+
margin-bottom: 4px;
|
| 413 |
}
|
| 414 |
|
| 415 |
.failure-item-score {
|
| 416 |
+
font-size: 0.8rem;
|
| 417 |
+
color: var(--error);
|
| 418 |
+
font-weight: 500;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
/* -----------------------------
|
| 422 |
+
Status Colors
|
| 423 |
+
------------------------------*/
|
| 424 |
+
.status-good {
|
| 425 |
+
color: var(--success);
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
.status-warning {
|
| 429 |
+
color: var(--warning);
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
.status-critical {
|
| 433 |
+
color: var(--error);
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
/* Plotly chart background fix for dark mode */
|
| 437 |
+
[data-theme="dark"] .js-plotly-plot .plotly .bg {
|
| 438 |
+
fill: var(--surface) !important;
|
| 439 |
}
|
| 440 |
</style>
|
| 441 |
</head>
|
| 442 |
|
| 443 |
<body>
|
| 444 |
+
<button class="theme-toggle" id="themeToggle">🌙 Dark mode</button>
|
| 445 |
+
|
| 446 |
<div class="container">
|
| 447 |
+
<header>
|
| 448 |
+
<h1>📊 RAG Evaluation Dashboard</h1>
|
| 449 |
+
<p>Real-time metrics and analysis for your RAG pipeline</p>
|
| 450 |
<div class="nav-buttons">
|
| 451 |
+
<a href="/" class="nav-btn secondary">← Back to Main</a>
|
| 452 |
+
<a href="/ragas-demo" class="nav-btn">🔬 RAGAS Evaluation</a>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
</div>
|
| 454 |
+
</header>
|
| 455 |
+
|
| 456 |
+
<!-- Tab Navigation -->
|
| 457 |
+
<div class="tab-nav">
|
| 458 |
+
<button class="tab-btn active" onclick="showSection('overview')">Overview</button>
|
| 459 |
+
<button class="tab-btn" onclick="showSection('retrieval')">Retrieval</button>
|
| 460 |
+
<button class="tab-btn" onclick="showSection('generation')">Generation</button>
|
| 461 |
+
<button class="tab-btn" onclick="showSection('faithfulness')">Faithfulness</button>
|
| 462 |
+
<button class="tab-btn" onclick="showSection('performance')">Performance</button>
|
| 463 |
+
<button class="tab-btn" onclick="showSection('failures')">Failures</button>
|
| 464 |
</div>
|
| 465 |
|
| 466 |
+
<div class="loading" id="loading">
|
| 467 |
+
<div class="spinner"></div>
|
| 468 |
+
<p>Loading evaluation data...</p>
|
| 469 |
+
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
+
<!-- Overview Section -->
|
| 472 |
+
<div class="section active" id="overview">
|
| 473 |
+
<div class="metrics-grid">
|
| 474 |
+
<div class="metric-card">
|
| 475 |
+
<div class="metric-label">Total Evaluations</div>
|
| 476 |
+
<div class="metric-value" id="totalEvaluations">0</div>
|
| 477 |
</div>
|
| 478 |
+
<div class="metric-card success">
|
| 479 |
+
<div class="metric-label">Avg Precision</div>
|
| 480 |
+
<div class="metric-value" id="overviewPrecision">-</div>
|
| 481 |
</div>
|
| 482 |
+
<div class="metric-card">
|
| 483 |
+
<div class="metric-label">Avg Recall</div>
|
| 484 |
+
<div class="metric-value" id="overviewRecall">-</div>
|
| 485 |
+
</div>
|
| 486 |
+
<div class="metric-card success">
|
| 487 |
+
<div class="metric-label">Avg BERTScore</div>
|
| 488 |
+
<div class="metric-value" id="overviewBert">-</div>
|
| 489 |
+
</div>
|
| 490 |
+
<div class="metric-card">
|
| 491 |
+
<div class="metric-label">Avg Faithfulness</div>
|
| 492 |
+
<div class="metric-value" id="overviewFaith">-</div>
|
| 493 |
+
</div>
|
| 494 |
+
<div class="metric-card warning">
|
| 495 |
+
<div class="metric-label">Hallucination Rate</div>
|
| 496 |
+
<div class="metric-value" id="overviewHalluc">-</div>
|
| 497 |
+
</div>
|
| 498 |
+
<div class="metric-card">
|
| 499 |
+
<div class="metric-label">Avg Latency</div>
|
| 500 |
+
<div class="metric-value" id="overviewLatency">-</div>
|
| 501 |
+
<div class="metric-unit">ms</div>
|
| 502 |
+
</div>
|
| 503 |
+
<div class="metric-card">
|
| 504 |
+
<div class="metric-label">MRR</div>
|
| 505 |
+
<div class="metric-value" id="overviewMRR">-</div>
|
| 506 |
</div>
|
| 507 |
</div>
|
| 508 |
|
| 509 |
+
<div class="button-group">
|
| 510 |
+
<button class="btn" onclick="loadMetrics()">🔄 Refresh</button>
|
| 511 |
+
<button class="btn btn-secondary" onclick="exportResults()">📥 Export CSV</button>
|
| 512 |
+
<button class="btn btn-secondary" onclick="clearResults()">🗑️ Clear Results</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
</div>
|
| 514 |
|
| 515 |
+
<div class="info-box">
|
| 516 |
+
<p><strong>How to use:</strong> This dashboard shows evaluation metrics from your RAG pipeline.
|
| 517 |
+
Use the tabs above to explore different aspects of performance.
|
| 518 |
+
Try the <a href="/ragas-demo">RAGAS Evaluation</a> to run live evaluations.</p>
|
| 519 |
+
</div>
|
| 520 |
+
</div>
|
| 521 |
|
| 522 |
+
<!-- Retrieval Section -->
|
| 523 |
+
<div class="section" id="retrieval">
|
| 524 |
+
<div class="card">
|
| 525 |
+
<h2>Retrieval Metrics Over Time</h2>
|
| 526 |
+
<div class="chart-container" id="retrievalChart"></div>
|
| 527 |
+
</div>
|
| 528 |
+
<div class="metrics-grid">
|
| 529 |
+
<div class="metric-card">
|
| 530 |
+
<div class="metric-label">MRR</div>
|
| 531 |
+
<div class="metric-value" id="mrrValue">-</div>
|
| 532 |
</div>
|
| 533 |
+
<div class="metric-card success">
|
| 534 |
+
<div class="metric-label">Avg Precision</div>
|
| 535 |
+
<div class="metric-value" id="avgPrecision">-</div>
|
| 536 |
+
</div>
|
| 537 |
+
<div class="metric-card">
|
| 538 |
+
<div class="metric-label">Avg Recall</div>
|
| 539 |
+
<div class="metric-value" id="avgRecall">-</div>
|
| 540 |
</div>
|
| 541 |
</div>
|
| 542 |
+
</div>
|
| 543 |
|
| 544 |
+
<!-- Generation Section -->
|
| 545 |
+
<div class="section" id="generation">
|
| 546 |
+
<div class="two-column">
|
| 547 |
+
<div class="card">
|
| 548 |
+
<h2>Generation Quality Over Time</h2>
|
| 549 |
+
<div class="chart-container" id="generationChart"></div>
|
| 550 |
+
</div>
|
| 551 |
+
<div class="card">
|
| 552 |
+
<h2>Average Scores</h2>
|
| 553 |
+
<div class="chart-container" id="generationBars"></div>
|
|
|
|
|
|
|
|
|
|
| 554 |
</div>
|
| 555 |
</div>
|
| 556 |
+
</div>
|
| 557 |
|
| 558 |
+
<!-- Faithfulness Section -->
|
| 559 |
+
<div class="section" id="faithfulness">
|
| 560 |
+
<div class="two-column">
|
| 561 |
+
<div class="card">
|
| 562 |
+
<h2>Hallucination Rate</h2>
|
| 563 |
+
<div class="chart-container" id="hallucinationChart"></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
</div>
|
| 565 |
+
<div class="card">
|
| 566 |
+
<h2>Faithfulness Over Time</h2>
|
| 567 |
+
<div class="chart-container" id="faithfulnessChart"></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
</div>
|
| 569 |
</div>
|
| 570 |
+
</div>
|
| 571 |
|
| 572 |
+
<!-- Performance Section -->
|
| 573 |
+
<div class="section" id="performance">
|
| 574 |
+
<div class="card">
|
| 575 |
+
<h2>Latency Distribution</h2>
|
| 576 |
+
<div class="chart-container" id="latencyChart"></div>
|
| 577 |
+
</div>
|
| 578 |
+
<div class="metrics-grid">
|
| 579 |
+
<div class="metric-card">
|
| 580 |
+
<div class="metric-label">P50 Latency</div>
|
| 581 |
+
<div class="metric-value" id="p50Value">-</div>
|
| 582 |
+
<div class="metric-unit">ms</div>
|
| 583 |
+
</div>
|
| 584 |
+
<div class="metric-card warning">
|
| 585 |
+
<div class="metric-label">P95 Latency</div>
|
| 586 |
+
<div class="metric-value" id="p95Value">-</div>
|
| 587 |
+
<div class="metric-unit">ms</div>
|
| 588 |
+
</div>
|
| 589 |
+
<div class="metric-card">
|
| 590 |
+
<div class="metric-label">P99 Latency</div>
|
| 591 |
+
<div class="metric-value" id="p99Value">-</div>
|
| 592 |
+
<div class="metric-unit">ms</div>
|
| 593 |
+
</div>
|
| 594 |
+
<div class="metric-card">
|
| 595 |
+
<div class="metric-label">Cost per Query</div>
|
| 596 |
+
<div class="metric-value" id="costValue">-</div>
|
| 597 |
+
<div class="metric-unit">USD</div>
|
| 598 |
+
</div>
|
| 599 |
+
</div>
|
| 600 |
+
</div>
|
| 601 |
|
| 602 |
+
<!-- Failures Section -->
|
| 603 |
+
<div class="section" id="failures">
|
| 604 |
+
<div class="two-column">
|
| 605 |
+
<div class="card">
|
| 606 |
+
<h2>Failure Modes</h2>
|
| 607 |
+
<div class="chart-container" id="failureChart"></div>
|
| 608 |
+
</div>
|
| 609 |
+
<div class="card">
|
| 610 |
+
<h2>Failure Counts</h2>
|
| 611 |
+
<div class="metrics-grid" style="margin-top: 20px;">
|
| 612 |
+
<div class="metric-card error">
|
| 613 |
+
<div class="metric-label">Total Failures</div>
|
| 614 |
+
<div class="metric-value" id="totalFailures">0</div>
|
| 615 |
+
</div>
|
| 616 |
+
<div class="metric-card error">
|
| 617 |
+
<div class="metric-label">Hallucinations</div>
|
| 618 |
+
<div class="metric-value" id="hallCount">0</div>
|
| 619 |
+
</div>
|
| 620 |
+
<div class="metric-card warning">
|
| 621 |
+
<div class="metric-label">Low Retrieval</div>
|
| 622 |
+
<div class="metric-value" id="retCount">0</div>
|
| 623 |
+
</div>
|
| 624 |
+
<div class="metric-card warning">
|
| 625 |
+
<div class="metric-label">Low Generation</div>
|
| 626 |
+
<div class="metric-value" id="genCount">0</div>
|
| 627 |
</div>
|
| 628 |
</div>
|
| 629 |
</div>
|
| 630 |
+
</div>
|
| 631 |
+
<div class="card">
|
| 632 |
+
<h2>Recent Failures</h2>
|
| 633 |
+
<div id="failureList">
|
| 634 |
+
<p style="color: var(--text-muted); padding: 20px;">No failures detected! 🎉</p>
|
| 635 |
</div>
|
| 636 |
</div>
|
| 637 |
</div>
|
| 638 |
</div>
|
| 639 |
|
| 640 |
<script>
|
| 641 |
+
const API_URL = window.location.origin;
|
| 642 |
let metricsData = null;
|
| 643 |
let timeseriesData = null;
|
| 644 |
let failureData = null;
|
| 645 |
|
| 646 |
+
// Tab navigation
|
| 647 |
/**
 * Show one dashboard section, highlight its tab button, and load its charts.
 *
 * The tab buttons call this from inline handlers such as
 * `onclick="showSection('retrieval')"`, so the button belonging to a section
 * is found by that attribute instead of reading the implicit global `event`
 * (deprecated, and undefined when this function is called from script).
 *
 * @param {string} sectionId - id of the `.section` element to activate.
 * @param {Event} [triggerEvent] - optional click event, used only as a
 *     fallback for locating the tab button.
 */
function showSection(sectionId, triggerEvent) {
    const section = document.getElementById(sectionId);
    if (!section) {
        // Leave the current view untouched rather than throwing on null.
        console.error('Unknown dashboard section:', sectionId);
        return;
    }

    const tabButtons = Array.from(document.querySelectorAll('.tab-btn'));

    document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
    tabButtons.forEach(b => b.classList.remove('active'));

    section.classList.add('active');

    // Primary lookup: the button whose inline handler names this section.
    let tab = tabButtons.find(
        b => (b.getAttribute('onclick') || '').includes(`'${sectionId}'`)
    );

    // Fallback: the element that was actually clicked, if there is one.
    if (!tab) {
        const evt = triggerEvent || window.event;
        const target = evt && evt.target;
        if (target && typeof target.closest === 'function') {
            tab = target.closest('.tab-btn');
        }
    }

    if (tab) {
        tab.classList.add('active');
    }

    // Load section-specific data ('overview' is filled by loadMetrics()).
    switch (sectionId) {
        case 'retrieval': loadRetrievalCharts(); break;
        case 'generation': loadGenerationCharts(); break;
        case 'faithfulness': loadFaithfulnessCharts(); break;
        case 'performance': loadPerformanceCharts(); break;
        case 'failures': loadFailureCharts(); break;
    }
}
|
| 663 |
|
| 664 |
async function loadMetrics() {
|
| 665 |
showLoading(true);
|
| 666 |
try {
|
| 667 |
+
const response = await fetch(`${API_URL}/evaluation/metrics`);
|
| 668 |
metricsData = await response.json();
|
| 669 |
|
| 670 |
+
// Update overview
|
| 671 |
+
document.getElementById('totalEvaluations').textContent = metricsData.total_evaluations;
|
| 672 |
+
document.getElementById('overviewPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
|
| 673 |
+
document.getElementById('overviewRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
|
| 674 |
+
document.getElementById('overviewBert').textContent = metricsData.bert_score_mean?.toFixed(3) || '-';
|
| 675 |
+
document.getElementById('overviewFaith').textContent = metricsData.faithfulness_mean?.toFixed(3) || '-';
|
| 676 |
+
document.getElementById('overviewHalluc').textContent = ((metricsData.hallucination_rate || 0) * 100).toFixed(1) + '%';
|
| 677 |
+
document.getElementById('overviewLatency').textContent = metricsData.latency_mean?.toFixed(0) || '-';
|
| 678 |
+
document.getElementById('overviewMRR').textContent = metricsData.mrr?.toFixed(3) || '-';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 679 |
|
| 680 |
} catch (e) {
|
| 681 |
console.error('Error loading metrics:', e);
|
|
|
|
| 685 |
|
| 686 |
async function loadRetrievalCharts() {
|
| 687 |
try {
|
| 688 |
+
if (!timeseriesData) {
|
| 689 |
+
const response = await fetch(`${API_URL}/evaluation/timeseries`);
|
| 690 |
+
timeseriesData = await response.json();
|
| 691 |
+
}
|
| 692 |
|
| 693 |
if (!timeseriesData.query_idx || timeseriesData.query_idx.length === 0) {
|
| 694 |
+
document.getElementById('retrievalChart').innerHTML = '<p style="padding: 40px; text-align: center; color: var(--text-muted);">No data yet. Run some evaluations first.</p>';
|
| 695 |
return;
|
| 696 |
}
|
| 697 |
|
|
|
|
| 700 |
y: timeseriesData.retrieval_precision,
|
| 701 |
name: 'Precision',
|
| 702 |
mode: 'lines+markers',
|
| 703 |
+
line: { color: '#2563eb' }
|
| 704 |
};
|
| 705 |
|
| 706 |
const trace2 = {
|
|
|
|
| 708 |
y: timeseriesData.retrieval_recall,
|
| 709 |
name: 'Recall',
|
| 710 |
mode: 'lines+markers',
|
| 711 |
+
line: { color: '#16a34a' }
|
| 712 |
};
|
| 713 |
|
| 714 |
+
const layout = {
|
|
|
|
| 715 |
xaxis: { title: 'Query Index' },
|
| 716 |
yaxis: { title: 'Score' },
|
| 717 |
hovermode: 'x unified',
|
| 718 |
+
responsive: true,
|
| 719 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 720 |
+
plot_bgcolor: 'rgba(0,0,0,0)',
|
| 721 |
+
font: { color: getComputedStyle(document.body).getPropertyValue('--text-main') }
|
| 722 |
+
};
|
| 723 |
+
|
| 724 |
+
Plotly.newPlot('retrievalChart', [trace1, trace2], layout);
|
| 725 |
|
| 726 |
if (metricsData) {
|
| 727 |
+
document.getElementById('mrrValue').textContent = metricsData.mrr?.toFixed(3) || '-';
|
| 728 |
+
document.getElementById('avgPrecision').textContent = metricsData.retrieval_precision_mean?.toFixed(3) || '-';
|
| 729 |
+
document.getElementById('avgRecall').textContent = metricsData.retrieval_recall_mean?.toFixed(3) || '-';
|
| 730 |
}
|
| 731 |
|
| 732 |
} catch (e) {
|
|
|
|
| 737 |
async function loadGenerationCharts() {
|
| 738 |
try {
|
| 739 |
if (!timeseriesData) {
|
| 740 |
+
const response = await fetch(`${API_URL}/evaluation/timeseries`);
|
| 741 |
timeseriesData = await response.json();
|
| 742 |
}
|
| 743 |
|
|
|
|
| 748 |
y: timeseriesData.rouge_l,
|
| 749 |
name: 'ROUGE-L',
|
| 750 |
mode: 'lines+markers',
|
| 751 |
+
line: { color: '#f59e0b' }
|
| 752 |
};
|
| 753 |
|
| 754 |
const trace2 = {
|
|
|
|
| 756 |
y: timeseriesData.bert_score,
|
| 757 |
name: 'BERTScore',
|
| 758 |
mode: 'lines+markers',
|
| 759 |
+
line: { color: '#16a34a' }
|
| 760 |
};
|
| 761 |
|
| 762 |
+
const layout = {
|
| 763 |
+
xaxis: { title: 'Query Index' },
|
| 764 |
+
yaxis: { title: 'Score' },
|
| 765 |
+
hovermode: 'x unified',
|
| 766 |
+
responsive: true,
|
| 767 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 768 |
+
plot_bgcolor: 'rgba(0,0,0,0)'
|
| 769 |
+
};
|
| 770 |
+
|
| 771 |
+
Plotly.newPlot('generationChart', [trace1, trace2], layout);
|
| 772 |
|
| 773 |
if (metricsData) {
|
| 774 |
const barsTrace = {
|
| 775 |
x: ['ROUGE-L', 'BERTScore', 'Answer Relevance'],
|
| 776 |
y: [metricsData.rouge_l_mean, metricsData.bert_score_mean, metricsData.answer_relevance_mean],
|
| 777 |
type: 'bar',
|
| 778 |
+
marker: { color: ['#f59e0b', '#16a34a', '#2563eb'] }
|
| 779 |
};
|
| 780 |
|
| 781 |
Plotly.newPlot('generationBars', [barsTrace], {
|
| 782 |
+
yaxis: { title: 'Score' },
|
| 783 |
+
responsive: true,
|
| 784 |
+
showlegend: false,
|
| 785 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 786 |
+
plot_bgcolor: 'rgba(0,0,0,0)'
|
| 787 |
});
|
| 788 |
}
|
| 789 |
|
|
|
|
| 796 |
try {
|
| 797 |
if (!metricsData) await loadMetrics();
|
| 798 |
if (!timeseriesData) {
|
| 799 |
+
const response = await fetch(`${API_URL}/evaluation/timeseries`);
|
| 800 |
timeseriesData = await response.json();
|
| 801 |
}
|
| 802 |
|
| 803 |
+
const hallRate = metricsData.hallucination_rate || 0;
|
| 804 |
const faithfulRate = 1 - hallRate;
|
| 805 |
|
| 806 |
const pieTrace = {
|
| 807 |
labels: ['Faithful Answers', 'Hallucinations'],
|
| 808 |
values: [faithfulRate * 100, hallRate * 100],
|
| 809 |
type: 'pie',
|
| 810 |
+
marker: { colors: ['#16a34a', '#dc2626'] }
|
| 811 |
};
|
| 812 |
|
| 813 |
+
Plotly.newPlot('hallucinationChart', [pieTrace], {
|
| 814 |
+
responsive: true,
|
| 815 |
+
paper_bgcolor: 'rgba(0,0,0,0)'
|
| 816 |
+
});
|
| 817 |
|
| 818 |
if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
|
| 819 |
const faithTrace = {
|
|
|
|
| 821 |
y: timeseriesData.faithfulness,
|
| 822 |
name: 'Faithfulness',
|
| 823 |
mode: 'lines+markers',
|
| 824 |
+
line: { color: '#2563eb', width: 2 },
|
| 825 |
marker: { size: 6 }
|
| 826 |
};
|
| 827 |
|
| 828 |
Plotly.newPlot('faithfulnessChart', [faithTrace], {
|
| 829 |
+
xaxis: { title: 'Query Index' },
|
| 830 |
+
yaxis: { title: 'Score (0-1)' },
|
| 831 |
+
responsive: true,
|
| 832 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 833 |
+
plot_bgcolor: 'rgba(0,0,0,0)'
|
| 834 |
});
|
| 835 |
}
|
| 836 |
|
|
|
|
| 843 |
try {
|
| 844 |
if (!metricsData) await loadMetrics();
|
| 845 |
if (!timeseriesData) {
|
| 846 |
+
const response = await fetch(`${API_URL}/evaluation/timeseries`);
|
| 847 |
timeseriesData = await response.json();
|
| 848 |
}
|
| 849 |
|
| 850 |
if (timeseriesData.query_idx && timeseriesData.query_idx.length > 0) {
|
| 851 |
const latencyTrace = {
|
| 852 |
+
x: timeseriesData.query_idx,
|
| 853 |
y: timeseriesData.latency_ms,
|
| 854 |
+
mode: 'lines+markers',
|
| 855 |
+
line: { color: '#2563eb' },
|
| 856 |
+
marker: { size: 6 }
|
| 857 |
};
|
| 858 |
|
| 859 |
Plotly.newPlot('latencyChart', [latencyTrace], {
|
| 860 |
+
xaxis: { title: 'Query Index' },
|
| 861 |
+
yaxis: { title: 'Latency (ms)' },
|
| 862 |
+
responsive: true,
|
| 863 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 864 |
+
plot_bgcolor: 'rgba(0,0,0,0)'
|
| 865 |
});
|
| 866 |
}
|
| 867 |
|
| 868 |
+
document.getElementById('p50Value').textContent = metricsData.latency_p50?.toFixed(0) || '-';
|
| 869 |
+
document.getElementById('p95Value').textContent = metricsData.latency_p95?.toFixed(0) || '-';
|
| 870 |
+
document.getElementById('p99Value').textContent = metricsData.latency_p99?.toFixed(0) || '-';
|
| 871 |
+
document.getElementById('costValue').textContent = metricsData.cost_per_query ? (metricsData.cost_per_query / 100).toFixed(4) : '-';
|
| 872 |
|
| 873 |
} catch (e) {
|
| 874 |
console.error('Error loading performance charts:', e);
|
|
|
|
| 877 |
|
| 878 |
async function loadFailureCharts() {
|
| 879 |
try {
|
| 880 |
+
const response = await fetch(`${API_URL}/evaluation/failures`);
|
| 881 |
failureData = await response.json();
|
| 882 |
|
| 883 |
const failureChart = {
|
| 884 |
x: Object.keys(failureData.failure_modes),
|
| 885 |
y: Object.values(failureData.failure_modes),
|
| 886 |
type: 'bar',
|
| 887 |
+
marker: { color: '#dc2626' }
|
| 888 |
};
|
| 889 |
|
| 890 |
Plotly.newPlot('failureChart', [failureChart], {
|
| 891 |
+
yaxis: { title: 'Count' },
|
| 892 |
+
responsive: true,
|
| 893 |
+
showlegend: false,
|
| 894 |
+
paper_bgcolor: 'rgba(0,0,0,0)',
|
| 895 |
+
plot_bgcolor: 'rgba(0,0,0,0)'
|
| 896 |
});
|
| 897 |
|
| 898 |
document.getElementById('totalFailures').textContent = failureData.total_failures;
|
| 899 |
document.getElementById('hallCount').textContent = failureData.failure_modes.hallucinations;
|
| 900 |
document.getElementById('retCount').textContent = failureData.failure_modes.low_retrieval;
|
| 901 |
document.getElementById('genCount').textContent = failureData.failure_modes.low_generation;
|
|
|
|
| 902 |
|
| 903 |
// Show recent failures
|
| 904 |
const failureList = document.getElementById('failureList');
|
|
|
|
| 909 |
];
|
| 910 |
|
| 911 |
allFailures.forEach(f => {
|
| 912 |
+
html += `<div class="failure-item">
|
| 913 |
+
<div class="failure-item-query">${f.query}</div>
|
| 914 |
+
<div class="failure-item-score">Score: ${f.score.toFixed(3)}</div>
|
| 915 |
+
</div>`;
|
| 916 |
});
|
| 917 |
|
| 918 |
+
failureList.innerHTML = html || '<p style="padding: 20px; color: var(--text-muted);">No failures detected! π</p>';
|
| 919 |
|
| 920 |
} catch (e) {
|
| 921 |
console.error('Error loading failure analysis:', e);
|
|
|
|
| 928 |
|
| 929 |
async function exportResults() {
|
| 930 |
try {
|
| 931 |
+
const response = await fetch(`${API_URL}/evaluation/export`);
|
| 932 |
const blob = await response.blob();
|
| 933 |
const url = window.URL.createObjectURL(blob);
|
| 934 |
const a = document.createElement('a');
|
|
|
|
| 944 |
async function clearResults() {
|
| 945 |
if (confirm('Are you sure you want to clear all results?')) {
|
| 946 |
try {
|
| 947 |
+
await fetch(`${API_URL}/evaluation/reset`, { method: 'POST' });
|
| 948 |
metricsData = null;
|
| 949 |
timeseriesData = null;
|
| 950 |
failureData = null;
|
|
|
|
| 956 |
}
|
| 957 |
}
|
| 958 |
|
| 959 |
+
// Dark mode toggle.
// The chosen theme is stored under the "theme" key in localStorage and is
// applied as a data-theme attribute on <html>, which the CSS variables key on.
const themeToggle = document.getElementById("themeToggle");
const root = document.documentElement;

// Reading storage can throw (e.g. when site data is blocked). An uncaught
// error here would abort the rest of this script, including the 'load'
// listener that fetches the metrics, so fall back to "no saved theme".
let savedTheme = null;
try {
    savedTheme = localStorage.getItem("theme");
} catch (e) {
    console.warn('Theme preference unavailable:', e);
}
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;

// An explicit saved choice wins; otherwise follow the OS preference.
if (savedTheme) {
    root.setAttribute("data-theme", savedTheme);
} else if (prefersDark) {
    root.setAttribute("data-theme", "dark");
}

/** Set the toggle button label to the theme the next click switches to. */
function updateToggleText() {
    const isDark = root.getAttribute("data-theme") === "dark";
    themeToggle.textContent = isDark ? "☀️ Light mode" : "🌙 Dark mode";
}
updateToggleText();

themeToggle.addEventListener("click", () => {
    const isDark = root.getAttribute("data-theme") === "dark";
    const newTheme = isDark ? "light" : "dark";
    root.setAttribute("data-theme", newTheme);
    try {
        localStorage.setItem("theme", newTheme);
    } catch (e) {
        // The theme still changes for this page view; it just is not saved.
        console.warn('Theme preference could not be saved:', e);
    }
    updateToggleText();
});
|
| 985 |
+
|
| 986 |
// Load metrics on page load
|
| 987 |
window.addEventListener('load', loadMetrics);
|
| 988 |
</script>
|
frontend/index.html
CHANGED
|
@@ -485,6 +485,7 @@
|
|
| 485 |
<p>Ask questions about your research papers</p>
|
| 486 |
<nav>
|
| 487 |
<a href="/evaluation" class="eval-button">π Evaluation Dashboard</a>
|
|
|
|
| 488 |
</nav>
|
| 489 |
<button id="themeToggle" aria-label="Toggle dark mode" style="
|
| 490 |
position: fixed;
|
|
|
|
| 485 |
<p>Ask questions about your research papers</p>
|
| 486 |
<nav>
|
| 487 |
<a href="/evaluation" class="eval-button">π Evaluation Dashboard</a>
|
| 488 |
+
<a href="/ragas-demo" class="eval-button">🔬 RAGAS Evaluation</a>
|
| 489 |
</nav>
|
| 490 |
<button id="themeToggle" aria-label="Toggle dark mode" style="
|
| 491 |
position: fixed;
|
frontend/index_.html
ADDED
|
@@ -0,0 +1,592 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>Document Intelligence RAG</title>
|
| 8 |
+
<style>
|
| 9 |
+
* {
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
box-sizing: border-box;
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
body {
|
| 16 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
|
| 17 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 18 |
+
min-height: 100vh;
|
| 19 |
+
padding: 20px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.container {
|
| 23 |
+
max-width: 1000px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
header {
|
| 28 |
+
text-align: center;
|
| 29 |
+
color: white;
|
| 30 |
+
margin-bottom: 40px;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
header h1 {
|
| 34 |
+
font-size: 2.5em;
|
| 35 |
+
margin-bottom: 10px;
|
| 36 |
+
text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
header p {
|
| 40 |
+
font-size: 1.1em;
|
| 41 |
+
opacity: 0.9;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.main-grid {
|
| 45 |
+
display: grid;
|
| 46 |
+
grid-template-columns: 1fr 1fr;
|
| 47 |
+
gap: 20px;
|
| 48 |
+
margin-bottom: 20px;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.card {
|
| 52 |
+
background: white;
|
| 53 |
+
border-radius: 12px;
|
| 54 |
+
padding: 25px;
|
| 55 |
+
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
.card h2 {
|
| 59 |
+
color: #333;
|
| 60 |
+
margin-bottom: 15px;
|
| 61 |
+
font-size: 1.3em;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.upload-area {
|
| 65 |
+
border: 2px dashed #667eea;
|
| 66 |
+
border-radius: 8px;
|
| 67 |
+
padding: 30px;
|
| 68 |
+
text-align: center;
|
| 69 |
+
cursor: pointer;
|
| 70 |
+
transition: all 0.3s;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.upload-area:hover {
|
| 74 |
+
border-color: #764ba2;
|
| 75 |
+
background: #f8f9ff;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.upload-area.dragover {
|
| 79 |
+
border-color: #764ba2;
|
| 80 |
+
background: #f0f2ff;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.upload-area input {
|
| 84 |
+
display: none;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
.upload-area p {
|
| 88 |
+
color: #666;
|
| 89 |
+
margin-bottom: 10px;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
.btn {
|
| 93 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 94 |
+
color: white;
|
| 95 |
+
border: none;
|
| 96 |
+
padding: 12px 24px;
|
| 97 |
+
border-radius: 8px;
|
| 98 |
+
cursor: pointer;
|
| 99 |
+
font-size: 1em;
|
| 100 |
+
font-weight: 600;
|
| 101 |
+
transition: transform 0.2s, box-shadow 0.2s;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.btn:hover {
|
| 105 |
+
transform: translateY(-2px);
|
| 106 |
+
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.btn:active {
|
| 110 |
+
transform: translateY(0);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.btn-secondary {
|
| 114 |
+
background: #f0f0f0;
|
| 115 |
+
color: #333;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.btn-secondary:hover {
|
| 119 |
+
background: #e0e0e0;
|
| 120 |
+
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.query-input {
|
| 124 |
+
display: flex;
|
| 125 |
+
gap: 10px;
|
| 126 |
+
margin-bottom: 20px;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.query-input input {
|
| 130 |
+
flex: 1;
|
| 131 |
+
padding: 12px;
|
| 132 |
+
border: 2px solid #e0e0e0;
|
| 133 |
+
border-radius: 8px;
|
| 134 |
+
font-size: 1em;
|
| 135 |
+
transition: border-color 0.3s;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
.query-input input:focus {
|
| 139 |
+
outline: none;
|
| 140 |
+
border-color: #667eea;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.status {
|
| 144 |
+
padding: 15px;
|
| 145 |
+
border-radius: 8px;
|
| 146 |
+
margin-bottom: 15px;
|
| 147 |
+
font-size: 0.95em;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
.status.success {
|
| 151 |
+
background: #d4edda;
|
| 152 |
+
color: #155724;
|
| 153 |
+
border-left: 4px solid #28a745;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
.status.error {
|
| 157 |
+
background: #f8d7da;
|
| 158 |
+
color: #721c24;
|
| 159 |
+
border-left: 4px solid #f5c6cb;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.status.loading {
|
| 163 |
+
background: #e7f3ff;
|
| 164 |
+
color: #004085;
|
| 165 |
+
border-left: 4px solid #0c5ff4;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
.answer-box {
|
| 169 |
+
background: #f8f9fa;
|
| 170 |
+
border-left: 4px solid #667eea;
|
| 171 |
+
padding: 15px;
|
| 172 |
+
border-radius: 8px;
|
| 173 |
+
margin-bottom: 20px;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.answer-box h3 {
|
| 177 |
+
color: #333;
|
| 178 |
+
margin-bottom: 10px;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
.answer-box p {
|
| 182 |
+
color: #555;
|
| 183 |
+
line-height: 1.6;
|
| 184 |
+
margin-bottom: 15px;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
.sources {
|
| 188 |
+
background: white;
|
| 189 |
+
border-radius: 8px;
|
| 190 |
+
padding: 15px;
|
| 191 |
+
margin-bottom: 15px;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.sources h4 {
|
| 195 |
+
color: #333;
|
| 196 |
+
margin-bottom: 12px;
|
| 197 |
+
font-size: 0.95em;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.source-item {
|
| 201 |
+
padding: 10px;
|
| 202 |
+
background: #f8f9fa;
|
| 203 |
+
border-radius: 6px;
|
| 204 |
+
margin-bottom: 8px;
|
| 205 |
+
border-left: 3px solid #667eea;
|
| 206 |
+
font-size: 0.9em;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.source-item .relevance {
|
| 210 |
+
color: #667eea;
|
| 211 |
+
font-weight: 600;
|
| 212 |
+
margin-bottom: 5px;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.source-item .text {
|
| 216 |
+
color: #555;
|
| 217 |
+
font-style: italic;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
.stats {
|
| 221 |
+
display: grid;
|
| 222 |
+
grid-template-columns: repeat(2, 1fr);
|
| 223 |
+
gap: 10px;
|
| 224 |
+
margin-bottom: 20px;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.stat-box {
|
| 228 |
+
background: #f8f9fa;
|
| 229 |
+
padding: 12px;
|
| 230 |
+
border-radius: 6px;
|
| 231 |
+
text-align: center;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.stat-box .number {
|
| 235 |
+
font-size: 1.5em;
|
| 236 |
+
font-weight: bold;
|
| 237 |
+
color: #667eea;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.stat-box .label {
|
| 241 |
+
font-size: 0.85em;
|
| 242 |
+
color: #666;
|
| 243 |
+
margin-top: 5px;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
.status-grid {
|
| 247 |
+
display: grid;
|
| 248 |
+
grid-template-columns: repeat(4, 1fr);
|
| 249 |
+
gap: 10px;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.loading-spinner {
|
| 253 |
+
display: inline-block;
|
| 254 |
+
width: 20px;
|
| 255 |
+
height: 20px;
|
| 256 |
+
border: 3px solid #f3f3f3;
|
| 257 |
+
border-top: 3px solid #667eea;
|
| 258 |
+
border-radius: 50%;
|
| 259 |
+
animation: spin 1s linear infinite;
|
| 260 |
+
margin-right: 10px;
|
| 261 |
+
vertical-align: middle;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
@keyframes spin {
|
| 265 |
+
0% {
|
| 266 |
+
transform: rotate(0deg);
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
100% {
|
| 270 |
+
transform: rotate(360deg);
|
| 271 |
+
}
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.full-width {
|
| 275 |
+
grid-column: 1 / -1;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
@media (max-width: 768px) {
|
| 279 |
+
.main-grid {
|
| 280 |
+
grid-template-columns: 1fr;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
header h1 {
|
| 284 |
+
font-size: 1.8em;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.stats {
|
| 288 |
+
grid-template-columns: 1fr;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.status-grid {
|
| 292 |
+
grid-template-columns: repeat(2, 1fr);
|
| 293 |
+
}
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
.hidden {
|
| 297 |
+
display: none;
|
| 298 |
+
}
|
| 299 |
+
</style>
|
| 300 |
+
</head>
|
| 301 |
+
|
| 302 |
+
<body>
|
| 303 |
+
<div class="container">
|
| 304 |
+
<header>
|
| 305 |
+
<h1>π Document Intelligence RAG</h1>
|
| 306 |
+
<p>Ask questions about your research papers</p>
|
| 307 |
+
</header>
|
| 308 |
+
|
| 309 |
+
<div class="main-grid">
|
| 310 |
+
<!-- Upload Section -->
|
| 311 |
+
<div class="card">
|
| 312 |
+
<h2>📤 Upload Documents</h2>
|
| 313 |
+
|
| 314 |
+
<div class="upload-area" id="uploadArea">
|
| 315 |
+
<p>π Drag & drop PDFs here or click to browse</p>
|
| 316 |
+
<input type="file" id="fileInput" multiple accept=".pdf">
|
| 317 |
+
<button class="btn" onclick="document.getElementById('fileInput').click()">
|
| 318 |
+
Choose Files
|
| 319 |
+
</button>
|
| 320 |
+
</div>
|
| 321 |
+
|
| 322 |
+
<div id="uploadStatus" class="status hidden"></div>
|
| 323 |
+
|
| 324 |
+
<div id="stats" class="stats">
|
| 325 |
+
<div class="stat-box">
|
| 326 |
+
<div class="number" id="totalChunks">0</div>
|
| 327 |
+
<div class="label">Total Chunks</div>
|
| 328 |
+
</div>
|
| 329 |
+
<div class="stat-box">
|
| 330 |
+
<div class="number" id="docCount">0</div>
|
| 331 |
+
<div class="label">Documents</div>
|
| 332 |
+
</div>
|
| 333 |
+
</div>
|
| 334 |
+
|
| 335 |
+
<button class="btn btn-secondary" onclick="loadStats()">
|
| 336 |
+
π Refresh Stats
|
| 337 |
+
</button>
|
| 338 |
+
|
| 339 |
+
<button class="btn btn-secondary" style="background: #ff6b6b; color: white; margin-top: 10px;"
|
| 340 |
+
onclick="resetSystem()">
|
| 341 |
+
ποΈ Delete All Documents
|
| 342 |
+
</button>
|
| 343 |
+
|
| 344 |
+
<p style="font-size: 0.85em; color: #999; margin-top: 10px;">
|
| 345 |
+
💾 Documents are stored persistently. They remain after restart.
|
| 346 |
+
</p>
|
| 347 |
+
</div>
|
| 348 |
+
|
| 349 |
+
<!-- Query Section -->
|
| 350 |
+
<div class="card">
|
| 351 |
+
<h2>β Ask Questions</h2>
|
| 352 |
+
|
| 353 |
+
<div class="query-input">
|
| 354 |
+
<input type="text" id="queryInput" placeholder="What would you like to know about your documents?"
|
| 355 |
+
onkeypress="if(event.key==='Enter') submitQuery()">
|
| 356 |
+
<button class="btn" onclick="submitQuery()">Search</button>
|
| 357 |
+
</div>
|
| 358 |
+
|
| 359 |
+
<div id="queryStatus" class="status hidden"></div>
|
| 360 |
+
|
| 361 |
+
<div id="answerContainer" class="hidden">
|
| 362 |
+
<div class="answer-box">
|
| 363 |
+
<h3>Answer</h3>
|
| 364 |
+
<p id="answerText"></p>
|
| 365 |
+
</div>
|
| 366 |
+
|
| 367 |
+
<div class="sources" id="sourcesBox">
|
| 368 |
+
<h4>π Sources Used</h4>
|
| 369 |
+
<div id="sourcesList"></div>
|
| 370 |
+
</div>
|
| 371 |
+
</div>
|
| 372 |
+
</div>
|
| 373 |
+
</div>
|
| 374 |
+
|
| 375 |
+
<!-- Status Indicators -->
|
| 376 |
+
<div class="card full-width">
|
| 377 |
+
<h2>🔧 System Status</h2>
|
| 378 |
+
<div id="healthStatus" class="status-grid">Loading...</div>
|
| 379 |
+
</div>
|
| 380 |
+
</div>
|
| 381 |
+
|
| 382 |
+
<script>
|
| 383 |
+
const API_URL = window.location.origin; // 'http://localhost:8000';

// Upload handlers
const uploadArea = document.getElementById('uploadArea');
const fileInput = document.getElementById('fileInput');

// Clicking the drop zone opens the file picker. Clicks that bubble up from the
// "Choose Files" button (which already calls fileInput.click() in its own
// onclick) or from the hidden input itself are ignored here; without this
// guard a single click on the button requests the picker twice.
uploadArea.addEventListener('click', (e) => {
    if (e.target === fileInput || e.target.closest('button')) {
        return;
    }
    fileInput.click();
});

// preventDefault on dragover is what allows the subsequent drop event to fire.
uploadArea.addEventListener('dragover', (e) => {
    e.preventDefault();
    uploadArea.classList.add('dragover');
});

uploadArea.addEventListener('dragleave', () => {
    uploadArea.classList.remove('dragover');
});

// Dropped files go through the same path as files chosen in the picker.
uploadArea.addEventListener('drop', (e) => {
    e.preventDefault();
    uploadArea.classList.remove('dragover');
    handleFiles(e.dataTransfer.files);
});

fileInput.addEventListener('change', (e) => {
    handleFiles(e.target.files);
});
|
| 406 |
+
|
| 407 |
+
/**
 * Upload the given files to the `/ingest` endpoint one at a time.
 *
 * Non-PDF files are skipped with an error status. Progress and the outcome of
 * each upload are written to the #uploadStatus element, and the stats panel is
 * refreshed after every successful ingest.
 *
 * @param {FileList|File[]} files Files picked in the dialog or dropped on the page.
 */
async function handleFiles(files) {
    const statusDiv = document.getElementById('uploadStatus');

    for (const file of files) {
        // Compare case-insensitively: the picker's accept=".pdf" filter lets
        // "REPORT.PDF" through, so it must not be rejected here.
        if (!file.name.toLowerCase().endsWith('.pdf')) {
            showStatus(statusDiv, `Skipping ${file.name} - only PDFs supported`, 'error');
            continue;
        }

        showStatus(statusDiv, `Uploading ${file.name}...`, 'loading');

        const formData = new FormData();
        formData.append('file', file);

        try {
            const response = await fetch(`${API_URL}/ingest`, {
                method: 'POST',
                body: formData
            });

            if (response.ok) {
                const data = await response.json();
                showStatus(
                    statusDiv,
                    `β ${file.name}: ${data.chunks_embedded} chunks ingested`,
                    'success'
                );
                loadStats();
            } else {
                const error = await response.json();
                showStatus(statusDiv, `β ${file.name}: ${error.detail}`, 'error');
            }
        } catch (error) {
            // Network failures and unparsable response bodies both land here.
            showStatus(statusDiv, `β Upload failed: ${error.message}`, 'error');
        }
    }

    // Clear the input so selecting the same file again fires `change` once more.
    fileInput.value = '';
}
|
| 446 |
+
|
| 447 |
+
/**
 * Send the question typed into #queryInput to the `/query` endpoint and render
 * the answer, reporting progress and failures in #queryStatus.
 *
 * An empty (whitespace-only) question is rejected without contacting the API.
 */
async function submitQuery() {
    const query = document.getElementById('queryInput').value.trim();
    if (!query) {
        showStatus(
            document.getElementById('queryStatus'),
            'Please enter a question',
            'error'
        );
        return;
    }

    const statusDiv = document.getElementById('queryStatus');
    showStatus(statusDiv, 'Searching your documents...', 'loading');

    try {
        const response = await fetch(`${API_URL}/query`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ query, top_k: 3 })
        });

        if (response.ok) {
            const data = await response.json();
            displayAnswer(data);
            statusDiv.classList.add('hidden');
        } else {
            const error = await response.json();
            // The other handlers on this page read the failure text from
            // `detail`; accept that field as well as `error` so the server's
            // message is shown instead of the generic fallback.
            showStatus(statusDiv, error.detail || error.error || 'Query failed', 'error');
        }
    } catch (error) {
        showStatus(statusDiv, `Error: ${error.message}`, 'error');
    }
}
|
| 480 |
+
|
| 481 |
+
/**
 * Render a query response: the answer text plus one card per retrieved source.
 *
 * Source previews come from uploaded documents, so they are inserted with
 * textContent rather than interpolated into innerHTML; markup inside a
 * document can therefore never be executed by the page.
 *
 * @param {{answer: string, sources?: Array<{similarity: number, preview: string}>}} data
 *        Parsed JSON body of the `/query` response.
 */
function displayAnswer(data) {
    document.getElementById('answerText').textContent = data.answer;

    const sourcesList = document.getElementById('sourcesList');
    sourcesList.innerHTML = '';

    // A response without a sources array simply renders no source cards.
    for (const source of data.sources || []) {
        const relevance = document.createElement('div');
        relevance.className = 'relevance';
        relevance.textContent = `π Relevance: ${(source.similarity * 100).toFixed(0)}%`;

        const text = document.createElement('div');
        text.className = 'text';
        text.textContent = source.preview;

        const item = document.createElement('div');
        item.className = 'source-item';
        item.appendChild(relevance);
        item.appendChild(text);
        sourcesList.appendChild(item);
    }

    document.getElementById('answerContainer').classList.remove('hidden');
}
|
| 494 |
+
|
| 495 |
+
/**
 * Fetch `/stats` and show the reported chunk total in #totalChunks.
 * A non-OK response is ignored; a thrown error is logged to the console.
 */
async function loadStats() {
    try {
        const res = await fetch(`${API_URL}/stats`);
        if (!res.ok) {
            return;
        }
        const stats = await res.json();
        document.getElementById('totalChunks').textContent = stats.total_chunks;
    } catch (err) {
        console.error('Failed to load stats:', err);
    }
}
|
| 506 |
+
|
| 507 |
+
/**
 * Fetch `/health` and render the four status tiles (embeddings, LLM, vector
 * DB, overall) into #healthStatus.
 *
 * A non-OK response and a failed request each show an error banner, so the
 * panel never stays stuck on its initial "Loading..." text.
 */
async function loadHealth() {
    const healthStatus = document.getElementById('healthStatus');
    // Full-width banner used for both failure modes.
    const errorBanner = (message) =>
        `<div style="grid-column: 1/-1; padding: 15px; background: #f8d7da; color: #721c24; border-radius: 8px;">${message}</div>`;

    try {
        const response = await fetch(`${API_URL}/health`);
        if (!response.ok) {
            healthStatus.innerHTML = errorBanner(`Health check failed (HTTP ${response.status})`);
            return;
        }

        const data = await response.json();

        // Get embedding backend name
        let embeddingName = data.embedding_backend || 'Unknown';
        // Format nicely
        if (embeddingName === 'sentence-transformers') {
            embeddingName = 'Sentence-Transformers';
        } else if (embeddingName === 'ollama') {
            embeddingName = 'Ollama';
        }

        // A payload without a `chroma` object is shown as a failed tile rather
        // than throwing and being misreported as a connection error.
        const chromaStatus = data.chroma ? data.chroma.status : undefined;

        healthStatus.innerHTML = `
            <div class="stat-box">
                <div class="number">${data.embedding_backend ? 'β' : 'β'}</div>
                <div class="label">${embeddingName} (Embeddings)</div>
            </div>
            <div class="stat-box">
                <div class="number">${data.groq === 'β' ? 'β' : 'β'}</div>
                <div class="label">Groq (LLM)</div>
            </div>
            <div class="stat-box">
                <div class="number">${chromaStatus === 'β' ? 'β' : 'β'}</div>
                <div class="label">Chroma (Vector DB)</div>
            </div>
            <div class="stat-box">
                <div class="number">${data.status === 'healthy' ? 'β' : 'β '}</div>
                <div class="label">Overall Status</div>
            </div>
        `;
    } catch (error) {
        healthStatus.innerHTML = errorBanner(`Cannot connect to API at ${API_URL}`);
    }
}
|
| 547 |
+
|
| 548 |
+
/**
 * After a confirmation prompt, POST to `/reset` to delete every stored
 * document, then report the outcome in #uploadStatus and refresh the stats.
 */
async function resetSystem() {
    const confirmed = confirm('β οΈ Delete ALL documents and embeddings? This cannot be undone!');
    if (!confirmed) {
        return;
    }

    const statusDiv = document.getElementById('uploadStatus');
    showStatus(statusDiv, 'Resetting system...', 'loading');

    try {
        const res = await fetch(`${API_URL}/reset`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' }
        });

        if (!res.ok) {
            const failure = await res.json();
            showStatus(statusDiv, `Reset failed: ${failure.detail || 'Unknown error'}`, 'error');
            return;
        }

        // The success body is parsed but not used; an unparsable body still
        // falls through to the catch below.
        await res.json();
        showStatus(statusDiv, 'β All documents deleted!', 'success');
        loadStats();
    } catch (err) {
        showStatus(statusDiv, `Error: ${err.message}`, 'error');
    }
}
|
| 576 |
+
|
| 577 |
+
/**
 * Show a status message in the given element.
 *
 * @param {HTMLElement} element Target status container.
 * @param {string} message Text to display.
 * @param {string} type Modifier class placed after "status" (the page passes 'success', 'error' or 'loading').
 */
function showStatus(element, message, type) {
    // Overwriting className drops any previous modifier classes.
    element.className = ['status', type].join(' ');
    element.textContent = message;
    element.classList.remove('hidden');
}
|
| 582 |
+
|
| 583 |
+
// Once the page has loaded, populate the stats and health panels, then
// re-check health every 30 seconds (30000 ms).
window.addEventListener('load', function () {
    loadStats();
    loadHealth();
    setInterval(loadHealth, 30000);
});
|
| 589 |
+
</script>
|
| 590 |
+
</body>
|
| 591 |
+
|
| 592 |
+
</html>
|
frontend/ragas.html
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>RAGAS Evaluation - Document Intelligence RAG</title>
|
| 8 |
+
<style>
|
| 9 |
+
/* Theme toggle */
|
| 10 |
+
.theme-toggle {
|
| 11 |
+
position: fixed;
|
| 12 |
+
top: 20px;
|
| 13 |
+
right: 20px;
|
| 14 |
+
padding: 8px 14px;
|
| 15 |
+
background: var(--surface);
|
| 16 |
+
border: 1px solid var(--border-soft);
|
| 17 |
+
border-radius: var(--radius-md);
|
| 18 |
+
cursor: pointer;
|
| 19 |
+
font-size: 0.85rem;
|
| 20 |
+
color: var(--text-main);
|
| 21 |
+
z-index: 100;
|
| 22 |
+
transition: background 0.15s ease;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
.theme-toggle:hover {
|
| 26 |
+
background: var(--surface-subtle);
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
/* Button group */
|
| 30 |
+
.button-group {
|
| 31 |
+
display: flex;
|
| 32 |
+
gap: 10px;
|
| 33 |
+
margin-bottom: 16px;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.btn-small {
|
| 37 |
+
padding: 8px 14px;
|
| 38 |
+
background: var(--accent);
|
| 39 |
+
color: white;
|
| 40 |
+
border: none;
|
| 41 |
+
border-radius: var(--radius-sm);
|
| 42 |
+
font-size: 0.85rem;
|
| 43 |
+
cursor: pointer;
|
| 44 |
+
transition: background 0.15s ease;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.btn-small:hover {
|
| 48 |
+
background: #1d4ed8;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
.btn-small.secondary {
|
| 52 |
+
background: var(--surface-subtle);
|
| 53 |
+
color: var(--text-main);
|
| 54 |
+
border: 1px solid var(--border-soft);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.btn-small.secondary:hover {
|
| 58 |
+
background: var(--border-soft);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
/* Same design tokens as index.html */
|
| 62 |
+
:root {
|
| 63 |
+
--surface: #ffffff;
|
| 64 |
+
--surface-subtle: #fafafa;
|
| 65 |
+
--bg-main: #f5f7fb;
|
| 66 |
+
--card-bg: #ffffff;
|
| 67 |
+
--accent: #2563eb;
|
| 68 |
+
--accent-soft: #eff6ff;
|
| 69 |
+
--text-main: #111827;
|
| 70 |
+
--text-muted: #6b7280;
|
| 71 |
+
--border-soft: #e5e7eb;
|
| 72 |
+
--success: #16a34a;
|
| 73 |
+
--error: #dc2626;
|
| 74 |
+
--warning: #f59e0b;
|
| 75 |
+
--radius-sm: 6px;
|
| 76 |
+
--radius-md: 10px;
|
| 77 |
+
--radius-lg: 14px;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
[data-theme="dark"] {
|
| 81 |
+
--surface: #1f2937;
|
| 82 |
+
--surface-subtle: #111827;
|
| 83 |
+
--bg-main: #0f172a;
|
| 84 |
+
--card-bg: #1e293b;
|
| 85 |
+
--accent: #60a5fa;
|
| 86 |
+
--accent-soft: #1e3a5f;
|
| 87 |
+
--text-main: #f1f5f9;
|
| 88 |
+
--text-muted: #94a3b8;
|
| 89 |
+
--border-soft: #334155;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
* {
|
| 93 |
+
margin: 0;
|
| 94 |
+
padding: 0;
|
| 95 |
+
box-sizing: border-box;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
body {
|
| 99 |
+
font-family: Inter, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
| 100 |
+
background: var(--bg-main);
|
| 101 |
+
min-height: 100vh;
|
| 102 |
+
padding: 24px;
|
| 103 |
+
color: var(--text-main);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.container {
|
| 107 |
+
max-width: 900px;
|
| 108 |
+
margin: 0 auto;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
header {
|
| 112 |
+
text-align: center;
|
| 113 |
+
margin-bottom: 36px;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
header h1 {
|
| 117 |
+
font-size: 2rem;
|
| 118 |
+
font-weight: 600;
|
| 119 |
+
letter-spacing: -0.02em;
|
| 120 |
+
margin-bottom: 8px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
header p {
|
| 124 |
+
font-size: 1rem;
|
| 125 |
+
color: var(--text-muted);
|
| 126 |
+
margin-bottom: 16px;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.nav-links {
|
| 130 |
+
display: flex;
|
| 131 |
+
justify-content: center;
|
| 132 |
+
gap: 12px;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.nav-links a {
|
| 136 |
+
color: var(--accent);
|
| 137 |
+
text-decoration: none;
|
| 138 |
+
font-size: 0.9rem;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.nav-links a:hover {
|
| 142 |
+
text-decoration: underline;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.card {
|
| 146 |
+
background: var(--surface);
|
| 147 |
+
border-radius: var(--radius-lg);
|
| 148 |
+
padding: 28px;
|
| 149 |
+
border: 1px solid var(--border-soft);
|
| 150 |
+
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.05);
|
| 151 |
+
margin-bottom: 24px;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.card h2 {
|
| 155 |
+
font-size: 1.2rem;
|
| 156 |
+
font-weight: 600;
|
| 157 |
+
margin-bottom: 18px;
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
.query-section {
|
| 161 |
+
display: flex;
|
| 162 |
+
gap: 12px;
|
| 163 |
+
margin-bottom: 16px;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.query-section input {
|
| 167 |
+
flex: 1;
|
| 168 |
+
padding: 12px 14px;
|
| 169 |
+
border: 1.5px solid var(--border-soft);
|
| 170 |
+
border-radius: var(--radius-md);
|
| 171 |
+
font-size: 0.95rem;
|
| 172 |
+
background: var(--surface);
|
| 173 |
+
color: var(--text-main);
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.query-section input:focus {
|
| 177 |
+
outline: none;
|
| 178 |
+
border-color: var(--accent);
|
| 179 |
+
box-shadow: 0 0 0 3px var(--accent-soft);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.btn {
|
| 183 |
+
background: var(--accent);
|
| 184 |
+
color: white;
|
| 185 |
+
border: none;
|
| 186 |
+
padding: 12px 24px;
|
| 187 |
+
border-radius: var(--radius-md);
|
| 188 |
+
font-size: 0.95rem;
|
| 189 |
+
font-weight: 500;
|
| 190 |
+
cursor: pointer;
|
| 191 |
+
transition: background 0.15s ease, transform 0.15s ease;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.btn:hover {
|
| 195 |
+
background: #1d4ed8;
|
| 196 |
+
transform: translateY(-1px);
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.btn:disabled {
|
| 200 |
+
background: var(--text-muted);
|
| 201 |
+
cursor: not-allowed;
|
| 202 |
+
transform: none;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.status {
|
| 206 |
+
padding: 12px;
|
| 207 |
+
border-radius: var(--radius-md);
|
| 208 |
+
font-size: 0.9rem;
|
| 209 |
+
margin-bottom: 16px;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.status.loading {
|
| 213 |
+
background: var(--accent-soft);
|
| 214 |
+
color: var(--accent);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.status.error {
|
| 218 |
+
background: #fef2f2;
|
| 219 |
+
color: var(--error);
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
.status.success {
|
| 223 |
+
background: #f0fdf4;
|
| 224 |
+
color: var(--success);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.hidden {
|
| 228 |
+
display: none;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
/* Answer Section */
|
| 232 |
+
.answer-box {
|
| 233 |
+
background: var(--surface-subtle);
|
| 234 |
+
border-radius: var(--radius-md);
|
| 235 |
+
padding: 16px;
|
| 236 |
+
margin-bottom: 20px;
|
| 237 |
+
border-left: 4px solid var(--accent);
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.answer-box p {
|
| 241 |
+
line-height: 1.6;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
/* RAGAS Scores */
|
| 245 |
+
.scores-grid {
|
| 246 |
+
display: grid;
|
| 247 |
+
grid-template-columns: repeat(3, 1fr);
|
| 248 |
+
gap: 16px;
|
| 249 |
+
margin-bottom: 20px;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.score-card {
|
| 253 |
+
background: var(--surface-subtle);
|
| 254 |
+
border-radius: var(--radius-md);
|
| 255 |
+
padding: 20px;
|
| 256 |
+
text-align: center;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
.score-label {
|
| 260 |
+
font-size: 0.85rem;
|
| 261 |
+
color: var(--text-muted);
|
| 262 |
+
margin-bottom: 8px;
|
| 263 |
+
text-transform: uppercase;
|
| 264 |
+
letter-spacing: 0.5px;
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
.score-value {
|
| 268 |
+
font-size: 2rem;
|
| 269 |
+
font-weight: 700;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
.score-value.high {
|
| 273 |
+
color: var(--success);
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
.score-value.medium {
|
| 277 |
+
color: var(--warning);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.score-value.low {
|
| 281 |
+
color: var(--error);
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
/* Progress Bar */
|
| 285 |
+
.progress-bar {
|
| 286 |
+
height: 8px;
|
| 287 |
+
background: var(--border-soft);
|
| 288 |
+
border-radius: 4px;
|
| 289 |
+
overflow: hidden;
|
| 290 |
+
margin-top: 8px;
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
.progress-fill {
|
| 294 |
+
height: 100%;
|
| 295 |
+
border-radius: 4px;
|
| 296 |
+
transition: width 0.5s ease;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
.progress-fill.high {
|
| 300 |
+
background: var(--success);
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
.progress-fill.medium {
|
| 304 |
+
background: var(--warning);
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.progress-fill.low {
|
| 308 |
+
background: var(--error);
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
/* Aggregate Stats */
|
| 312 |
+
.aggregate-stats {
|
| 313 |
+
display: flex;
|
| 314 |
+
justify-content: space-around;
|
| 315 |
+
padding: 16px;
|
| 316 |
+
background: var(--surface-subtle);
|
| 317 |
+
border-radius: var(--radius-md);
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
.stat-item {
|
| 321 |
+
text-align: center;
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
.stat-value {
|
| 325 |
+
font-size: 1.5rem;
|
| 326 |
+
font-weight: 600;
|
| 327 |
+
color: var(--accent);
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.stat-label {
|
| 331 |
+
font-size: 0.8rem;
|
| 332 |
+
color: var(--text-muted);
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
/* Sources */
|
| 336 |
+
.sources-section {
|
| 337 |
+
margin-top: 20px;
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
.sources-section h3 {
|
| 341 |
+
font-size: 1rem;
|
| 342 |
+
margin-bottom: 12px;
|
| 343 |
+
color: var(--text-muted);
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
.source-item {
|
| 347 |
+
background: var(--surface-subtle);
|
| 348 |
+
border-radius: var(--radius-sm);
|
| 349 |
+
padding: 12px;
|
| 350 |
+
margin-bottom: 8px;
|
| 351 |
+
font-size: 0.85rem;
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
.source-similarity {
|
| 355 |
+
color: var(--accent);
|
| 356 |
+
font-weight: 500;
|
| 357 |
+
margin-bottom: 4px;
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
/* Info Box */
|
| 361 |
+
.info-box {
|
| 362 |
+
background: var(--accent-soft);
|
| 363 |
+
border-left: 4px solid var(--accent);
|
| 364 |
+
padding: 14px;
|
| 365 |
+
border-radius: var(--radius-sm);
|
| 366 |
+
margin-bottom: 20px;
|
| 367 |
+
}
|
| 368 |
+
|
| 369 |
+
.info-box p {
|
| 370 |
+
font-size: 0.9rem;
|
| 371 |
+
color: var(--text-muted);
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
/* Time display */
|
| 375 |
+
.time-info {
|
| 376 |
+
font-size: 0.85rem;
|
| 377 |
+
color: var(--text-muted);
|
| 378 |
+
text-align: right;
|
| 379 |
+
margin-top: 12px;
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
@media (max-width: 600px) {
|
| 383 |
+
.scores-grid {
|
| 384 |
+
grid-template-columns: 1fr;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.query-section {
|
| 388 |
+
flex-direction: column;
|
| 389 |
+
}
|
| 390 |
+
}
|
| 391 |
+
</style>
|
| 392 |
+
</head>
|
| 393 |
+
|
| 394 |
+
<body>
|
| 395 |
+
<button class="theme-toggle" id="themeToggle">π Dark mode</button>
|
| 396 |
+
|
| 397 |
+
<div class="container">
|
| 398 |
+
<header>
|
| 399 |
+
<h1>🔬 RAGAS Evaluation</h1>
|
| 400 |
+
<p>Test the RAG pipeline and see real-time quality metrics</p>
|
| 401 |
+
<div class="nav-links">
|
| 402 |
+
<a href="/">← Back to Main</a>
|
| 403 |
+
<a href="/evaluation">View Full Dashboard</a>
|
| 404 |
+
</div>
|
| 405 |
+
</header>
|
| 406 |
+
|
| 407 |
+
<!-- Query Section -->
|
| 408 |
+
<div class="card">
|
| 409 |
+
<h2>Ask a Question</h2>
|
| 410 |
+
<div class="info-box">
|
| 411 |
+
<p>Enter a question to query the RAG system. RAGAS will evaluate the response for
|
| 412 |
+
<strong>Faithfulness</strong> (is the answer grounded in context?) and <strong>Context
|
| 413 |
+
Precision</strong> (were the retrieved chunks useful?).
|
| 414 |
+
</p>
|
| 415 |
+
</div>
|
| 416 |
+
<div class="query-section">
|
| 417 |
+
<input type="text" id="queryInput" placeholder="e.g., What is machine learning?"
|
| 418 |
+
onkeypress="if(event.key === 'Enter') runEvaluation()">
|
| 419 |
+
<button class="btn" id="evalBtn" onclick="runEvaluation()">
|
| 420 |
+
π Query & Evaluate
|
| 421 |
+
</button>
|
| 422 |
+
</div>
|
| 423 |
+
<div id="status" class="status hidden"></div>
|
| 424 |
+
</div>
|
| 425 |
+
|
| 426 |
+
<!-- Results Section -->
|
| 427 |
+
<div id="resultsSection" class="card hidden">
|
| 428 |
+
<h2>π RAGAS Evaluation Results</h2>
|
| 429 |
+
|
| 430 |
+
<!-- Scores -->
|
| 431 |
+
<div class="scores-grid">
|
| 432 |
+
<div class="score-card">
|
| 433 |
+
<div class="score-label">Faithfulness</div>
|
| 434 |
+
<div class="score-value" id="faithScore">-</div>
|
| 435 |
+
<div class="progress-bar">
|
| 436 |
+
<div class="progress-fill" id="faithBar" style="width: 0%"></div>
|
| 437 |
+
</div>
|
| 438 |
+
</div>
|
| 439 |
+
<div class="score-card">
|
| 440 |
+
<div class="score-label">Context Precision</div>
|
| 441 |
+
<div class="score-value" id="precisionScore">-</div>
|
| 442 |
+
<div class="progress-bar">
|
| 443 |
+
<div class="progress-fill" id="precisionBar" style="width: 0%"></div>
|
| 444 |
+
</div>
|
| 445 |
+
</div>
|
| 446 |
+
<div class="score-card">
|
| 447 |
+
<div class="score-label">RAGAS Score</div>
|
| 448 |
+
<div class="score-value" id="ragasScore">-</div>
|
| 449 |
+
<div class="progress-bar">
|
| 450 |
+
<div class="progress-fill" id="ragasBar" style="width: 0%"></div>
|
| 451 |
+
</div>
|
| 452 |
+
</div>
|
| 453 |
+
</div>
|
| 454 |
+
|
| 455 |
+
<!-- Answer -->
|
| 456 |
+
<h3 style="margin-bottom: 12px;">π Generated Answer</h3>
|
| 457 |
+
<div class="answer-box">
|
| 458 |
+
<p id="answerText">-</p>
|
| 459 |
+
</div>
|
| 460 |
+
|
| 461 |
+
<!-- Sources -->
|
| 462 |
+
<div class="sources-section">
|
| 463 |
+
<h3>π Retrieved Sources</h3>
|
| 464 |
+
<div id="sourcesList"></div>
|
| 465 |
+
</div>
|
| 466 |
+
|
| 467 |
+
<div class="time-info">
|
| 468 |
+
<span id="responseTime"></span> | <span id="evalTime"></span>
|
| 469 |
+
</div>
|
| 470 |
+
</div>
|
| 471 |
+
|
| 472 |
+
<!-- Aggregate Stats -->
|
| 473 |
+
<div class="card">
|
| 474 |
+
<h2>π Aggregate Statistics</h2>
|
| 475 |
+
<div class="button-group">
|
| 476 |
+
<button class="btn-small" onclick="loadAggregateStats()">π Refresh</button>
|
| 477 |
+
<button class="btn-small secondary" onclick="clearRagasResults()">ποΈ Clear Results</button>
|
| 478 |
+
</div>
|
| 479 |
+
<div class="aggregate-stats">
|
| 480 |
+
<div class="stat-item">
|
| 481 |
+
<div class="stat-value" id="totalEvals">0</div>
|
| 482 |
+
<div class="stat-label">Total Evaluations</div>
|
| 483 |
+
</div>
|
| 484 |
+
<div class="stat-item">
|
| 485 |
+
<div class="stat-value" id="avgFaith">-</div>
|
| 486 |
+
<div class="stat-label">Avg Faithfulness</div>
|
| 487 |
+
</div>
|
| 488 |
+
<div class="stat-item">
|
| 489 |
+
<div class="stat-value" id="avgPrecision">-</div>
|
| 490 |
+
<div class="stat-label">Avg Precision</div>
|
| 491 |
+
</div>
|
| 492 |
+
<div class="stat-item">
|
| 493 |
+
<div class="stat-value" id="avgRagas">-</div>
|
| 494 |
+
<div class="stat-label">Avg RAGAS Score</div>
|
| 495 |
+
</div>
|
| 496 |
+
</div>
|
| 497 |
+
</div>
|
| 498 |
+
</div>
|
| 499 |
+
|
| 500 |
+
<script>
|
| 501 |
+
// Ask the backend to wipe stored RAGAS results and, on success, reset the
// aggregate-statistics widgets to their empty placeholders.
async function clearRagasResults() {
    const confirmed = confirm('Clear all RAGAS evaluation results? This cannot be undone.');
    if (!confirmed) return;

    // Placeholder text for each aggregate field, keyed by element id.
    const emptyValues = {
        totalEvals: '0',
        avgFaith: '-',
        avgPrecision: '-',
        avgRagas: '-'
    };

    try {
        const resetResponse = await fetch(`${API_URL}/ragas/reset`, { method: 'POST' });

        if (!resetResponse.ok) {
            alert('Failed to clear results');
            return;
        }

        // Reset displayed values
        for (const [elementId, text] of Object.entries(emptyValues)) {
            document.getElementById(elementId).textContent = text;
        }

        alert('RAGAS results cleared!');
    } catch (err) {
        // Network failures and unexpected DOM errors both surface here.
        alert('Error: ' + err.message);
    }
}
|
| 526 |
+
// Dark mode toggle
const themeToggle = document.getElementById("themeToggle");
const root = document.documentElement;

const savedTheme = localStorage.getItem("theme");
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;

// A stored choice always wins; otherwise fall back to the OS-level preference.
// With neither, the attribute is left unset.
if (savedTheme || prefersDark) {
    root.setAttribute("data-theme", savedTheme ? savedTheme : "dark");
}

// The button label names the theme a click would switch TO.
function updateToggleText() {
    if (root.getAttribute("data-theme") === "dark") {
        themeToggle.textContent = "βοΈ Light mode";
    } else {
        themeToggle.textContent = "π Dark mode";
    }
}
updateToggleText();

// Flip the theme, persist the choice, and refresh the button label.
themeToggle.addEventListener("click", () => {
    const newTheme = root.getAttribute("data-theme") === "dark" ? "light" : "dark";
    root.setAttribute("data-theme", newTheme);
    localStorage.setItem("theme", newTheme);
    updateToggleText();
});
|
| 552 |
+
const API_URL = window.location.origin;
|
| 553 |
+
|
| 554 |
+
// Send the typed question to /ragas/query-and-evaluate, render the response,
// and refresh the aggregate statistics. The button is disabled for the
// duration of the request and always restored afterwards.
async function runEvaluation() {
    const question = document.getElementById('queryInput').value.trim();
    if (question === '') {
        showStatus('Please enter a question', 'error');
        return;
    }

    const evalButton = document.getElementById('evalBtn');
    evalButton.disabled = true;
    evalButton.textContent = 'β³ Evaluating...';

    showStatus('Querying RAG and running RAGAS evaluation...', 'loading');
    // Hide any previous results while the new request is in flight.
    document.getElementById('resultsSection').classList.add('hidden');

    const requestOptions = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query: question, top_k: 3 })
    };

    try {
        const reply = await fetch(`${API_URL}/ragas/query-and-evaluate`, requestOptions);
        if (!reply.ok) {
            throw new Error('Evaluation failed');
        }

        displayResults(await reply.json());
        hideStatus();
        loadAggregateStats();
    } catch (err) {
        showStatus(`Error: ${err.message}`, 'error');
    } finally {
        evalButton.disabled = false;
        evalButton.textContent = 'π Query & Evaluate';
    }
}
|
| 591 |
+
|
| 592 |
+
/**
 * Render one query-and-evaluate response into the results card.
 *
 * Reads `data.answer`, `data.ragas` (faithfulness, context_precision,
 * ragas_score, eval_time_ms), `data.response_time_ms` and `data.sources`
 * (each with `similarity` and `preview`).
 *
 * Source previews are inserted with `textContent`, never as HTML, because
 * they are excerpts of retrieved documents and must not be interpreted as
 * markup.
 */
function displayResults(data) {
    const byId = (id) => document.getElementById(id);

    // Guard against a missing or non-numeric timing so that one absent field
    // does not abort rendering of everything after it.
    const formatMs = (value) =>
        (typeof value === 'number' && Number.isFinite(value) ? `${value.toFixed(0)}ms` : 'n/a');

    byId('resultsSection').classList.remove('hidden');

    // Answer
    byId('answerText').textContent = data.answer;

    // RAGAS Scores
    if (data.ragas) {
        setScore('faithScore', 'faithBar', data.ragas.faithfulness);
        setScore('precisionScore', 'precisionBar', data.ragas.context_precision);
        setScore('ragasScore', 'ragasBar', data.ragas.ragas_score);

        byId('evalTime').textContent = `Eval: ${formatMs(data.ragas.eval_time_ms)}`;
    } else {
        // No evaluation in this response: restore the initial placeholders so
        // the previous query's scores are not shown next to the new answer.
        [
            ['faithScore', 'faithBar'],
            ['precisionScore', 'precisionBar'],
            ['ragasScore', 'ragasBar']
        ].forEach(([valueId, barId]) => {
            const valueEl = byId(valueId);
            const barEl = byId(barId);
            valueEl.textContent = '-';
            valueEl.className = 'score-value';
            barEl.style.width = '0%';
            barEl.className = 'progress-fill';
        });
        byId('evalTime').textContent = '';
    }

    byId('responseTime').textContent = `Response: ${formatMs(data.response_time_ms)}`;

    // Sources
    const sourcesList = byId('sourcesList');
    const sources = Array.isArray(data.sources) ? data.sources : [];

    sourcesList.textContent = '';  // drop the previous query's entries
    sources.forEach((source) => {
        const item = document.createElement('div');
        item.className = 'source-item';

        const similarity = document.createElement('div');
        similarity.className = 'source-similarity';
        similarity.textContent = `π Similarity: ${(source.similarity * 100).toFixed(0)}%`;

        const preview = document.createElement('div');
        preview.textContent = source.preview == null ? '' : String(source.preview);

        item.appendChild(similarity);
        item.appendChild(preview);
        sourcesList.appendChild(item);
    });
}
|
| 620 |
+
|
| 621 |
+
/**
 * Show a 0..1 score as a percentage label plus a colour-coded progress bar.
 *
 * @param {string} valueId  id of the element that receives the "NN%" text
 * @param {string} barId    id of the progress-fill element
 * @param {number|null|undefined} score  metric value, expected in [0, 1]
 *
 * A missing or non-finite score (null, undefined, NaN) resets both elements
 * to the "-" / 0% placeholder instead of rendering "NaN%".
 */
function setScore(valueId, barId, score) {
    const valueEl = document.getElementById(valueId);
    const barEl = document.getElementById(barId);

    // Number(null) and Number('') are 0, so rule those out before converting.
    const isMissing = score === null || score === undefined || score === '';
    const numeric = isMissing ? Number.NaN : Number(score);

    if (!Number.isFinite(numeric)) {
        valueEl.textContent = '-';
        valueEl.className = 'score-value';
        barEl.style.width = '0%';
        barEl.className = 'progress-fill';
        return;
    }

    valueEl.textContent = (numeric * 100).toFixed(0) + '%';

    // The label shows the real value; only the bar width is clamped, since a
    // width outside 0-100% is not a usable CSS value for the fill.
    const clamped = Math.min(Math.max(numeric, 0), 1);
    barEl.style.width = (clamped * 100).toFixed(0) + '%';

    // Color coding
    let colorClass = 'high';
    if (numeric < 0.7) colorClass = 'medium';
    if (numeric < 0.5) colorClass = 'low';

    valueEl.className = 'score-value ' + colorClass;
    barEl.className = 'progress-fill ' + colorClass;
}
|
| 637 |
+
|
| 638 |
+
/**
 * Fetch /ragas/metrics and fill in the aggregate-statistics widgets.
 *
 * Failures (network error, non-2xx status, invalid JSON) are logged to the
 * console and leave the currently displayed values untouched.
 */
async function loadAggregateStats() {
    try {
        const response = await fetch(`${API_URL}/ragas/metrics`);
        if (!response.ok) {
            // Do not render an error payload as if it were statistics.
            throw new Error(`Metrics request failed with status ${response.status}`);
        }
        const data = await response.json();

        const total = data.total_evaluations || 0;

        // A genuine 0 average must be shown as "0%". Only a missing value, or
        // a 0 reported alongside zero evaluations, falls back to "-".
        const formatAverage = (value) => {
            if (typeof value !== 'number' || !Number.isFinite(value)) return '-';
            if (value === 0 && total === 0) return '-';
            return (value * 100).toFixed(0) + '%';
        };

        document.getElementById('totalEvals').textContent = total;
        document.getElementById('avgFaith').textContent = formatAverage(data.avg_faithfulness);
        document.getElementById('avgPrecision').textContent = formatAverage(data.avg_context_precision);
        document.getElementById('avgRagas').textContent = formatAverage(data.avg_ragas_score);
    } catch (e) {
        console.error('Failed to load aggregate stats:', e);
    }
}
|
| 654 |
+
|
| 655 |
+
// Show the status banner with the given text; `type` becomes a CSS modifier
// class next to the base "status" class.
function showStatus(message, type) {
    const banner = document.getElementById('status');
    banner.className = 'status ' + type;
    banner.classList.remove('hidden');
    banner.textContent = message;
}
|
| 661 |
+
|
| 662 |
+
// Hide the status banner by adding the "hidden" class.
function hideStatus() {
    const banner = document.getElementById('status');
    banner.classList.add('hidden');
}
|
| 665 |
+
|
| 666 |
+
// Load stats on page load
|
| 667 |
+
window.addEventListener('load', loadAggregateStats);
|
| 668 |
+
</script>
|
| 669 |
+
</body>
|
| 670 |
+
|
| 671 |
+
</html>
|
pyproject.toml
CHANGED
|
@@ -16,7 +16,10 @@ dependencies = [
|
|
| 16 |
"python-dotenv>=1.2.1",
|
| 17 |
"python-multipart>=0.0.20",
|
| 18 |
"requests>=2.32.5",
|
| 19 |
-
"sentence-transformers>=5.2.0",
|
| 20 |
"uvicorn[standard]>=0.38.0",
|
| 21 |
-
"scikit-learn>=1.3.0"
|
|
|
|
|
|
|
|
|
|
| 22 |
]
|
|
|
|
| 16 |
"python-dotenv>=1.2.1",
|
| 17 |
"python-multipart>=0.0.20",
|
| 18 |
"requests>=2.32.5",
|
| 19 |
+
"sentence-transformers>=5.2.0",
|
| 20 |
"uvicorn[standard]>=0.38.0",
|
| 21 |
+
"scikit-learn>=1.3.0",
|
| 22 |
+
"ragas>=0.4.2",
|
| 23 |
+
"langchain-groq>=1.1.1",
|
| 24 |
+
"langchain-openai>=1.1.6",
|
| 25 |
]
|
run_real_evaluation.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Real RAG Evaluation Script
|
| 3 |
+
Runs actual queries through my RAG and computes real metrics.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import tempfile
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Dict, Any
|
| 12 |
+
import numpy as np
|
| 13 |
+
from sentence_transformers import SentenceTransformer, util
|
| 14 |
+
|
| 15 |
+
from src.rag.pipeline import RAGPipeline
|
| 16 |
+
from src.evaluation import RAGEvaluator, EvaluationResult
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ==================== TEST DATASET ====================
|
| 20 |
+
# These are synthetic queries + documents, but metrics are REAL
|
| 21 |
+
# based on actual retrieval and generation from my RAG
|
| 22 |
+
|
| 23 |
+
TEST_DOCUMENTS = {
|
| 24 |
+
"medical_research_1.txt": """
|
| 25 |
+
Drug X Clinical Trial Results
|
| 26 |
+
|
| 27 |
+
A Phase III clinical trial was conducted to evaluate the efficacy and safety of Drug X
|
| 28 |
+
in patients with condition Y. The study enrolled 500 patients aged 18-65 with confirmed
|
| 29 |
+
diagnosis of condition Y.
|
| 30 |
+
|
| 31 |
+
Primary Efficacy Endpoint:
|
| 32 |
+
Drug X demonstrated a 65% response rate compared to 35% in the placebo group (p<0.001).
|
| 33 |
+
The median time to response was 4 weeks.
|
| 34 |
+
|
| 35 |
+
Safety Profile:
|
| 36 |
+
The most common adverse events were:
|
| 37 |
+
- Headache (12% of patients)
|
| 38 |
+
- Mild gastrointestinal upset (8% of patients)
|
| 39 |
+
- Dizziness (5% of patients)
|
| 40 |
+
- Fatigue (4% of patients)
|
| 41 |
+
|
| 42 |
+
Serious adverse events occurred in 2% of patients, including liver enzyme elevation.
|
| 43 |
+
No deaths were attributed to the drug during the trial period.
|
| 44 |
+
|
| 45 |
+
Dosage Recommendations:
|
| 46 |
+
The recommended dose is 500mg twice daily with meals. Dose adjustments may be necessary
|
| 47 |
+
for patients with renal impairment (dose reduction to 250mg twice daily recommended).
|
| 48 |
+
|
| 49 |
+
Mechanism of Action:
|
| 50 |
+
Drug X works by inhibiting protein kinase Y, which is overexpressed in condition Y cells.
|
| 51 |
+
This inhibition leads to cell cycle arrest and apoptosis of affected cells.
|
| 52 |
+
""",
|
| 53 |
+
|
| 54 |
+
"drug_interactions.txt": """
|
| 55 |
+
Drug X Drug Interaction Guide
|
| 56 |
+
|
| 57 |
+
Important Drug Interactions:
|
| 58 |
+
|
| 59 |
+
1. CYP3A4 Inhibitors (e.g., ketoconazole, ritonavir):
|
| 60 |
+
- May increase Drug X levels by 3-5 fold
|
| 61 |
+
- Monitor for adverse effects
|
| 62 |
+
- Consider dose reduction
|
| 63 |
+
|
| 64 |
+
2. Warfarin:
|
| 65 |
+
- Potential increased bleeding risk
|
| 66 |
+
- Monitor INR closely
|
| 67 |
+
- Baseline INR and weekly monitoring recommended
|
| 68 |
+
|
| 69 |
+
3. Oral Contraceptives:
|
| 70 |
+
- May reduce contraceptive efficacy
|
| 71 |
+
- Alternative contraception recommended
|
| 72 |
+
- No dose adjustment needed for Drug X
|
| 73 |
+
|
| 74 |
+
4. NSAIDs:
|
| 75 |
+
- Increased risk of GI bleeding
|
| 76 |
+
- Monitor for GI symptoms
|
| 77 |
+
- Consider gastroprotection
|
| 78 |
+
|
| 79 |
+
5. ACE Inhibitors:
|
| 80 |
+
- No significant interaction
|
| 81 |
+
- Safe to use concomitantly
|
| 82 |
+
- No monitoring required
|
| 83 |
+
""",
|
| 84 |
+
|
| 85 |
+
"patient_case_study.txt": """
|
| 86 |
+
Case Study: 45-year-old Female with Condition Y
|
| 87 |
+
|
| 88 |
+
Patient History:
|
| 89 |
+
A 45-year-old female presented with a 6-month history of progressive symptoms consistent
|
| 90 |
+
with condition Y. She has a past medical history of hypertension controlled on lisinopril
|
| 91 |
+
and type 2 diabetes on metformin.
|
| 92 |
+
|
| 93 |
+
Treatment Response:
|
| 94 |
+
Patient was started on Drug X 500mg twice daily. After 2 weeks of treatment, she reported
|
| 95 |
+
partial symptom improvement. By week 6, she achieved complete response with 95% symptom
|
| 96 |
+
resolution.
|
| 97 |
+
|
| 98 |
+
Side Effects Experienced:
|
| 99 |
+
- Mild headache (treated with acetaminophen)
|
| 100 |
+
- Occasional nausea (resolved with food intake)
|
| 101 |
+
- No serious adverse events
|
| 102 |
+
|
| 103 |
+
Follow-up:
|
| 104 |
+
Patient continues to do well on Drug X at 6-month follow-up with sustained response.
|
| 105 |
+
No dose adjustments were necessary. Lab values remain within normal limits.
|
| 106 |
+
"""
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
TEST_CASES = [
|
| 110 |
+
{
|
| 111 |
+
"query": "What is the response rate of Drug X?",
|
| 112 |
+
"expected_answer_keywords": ["65%", "response rate"],
|
| 113 |
+
"expected_source_docs": ["medical_research_1"],
|
| 114 |
+
"description": "Should retrieve clinical trial data"
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"query": "What are the side effects of Drug X?",
|
| 118 |
+
"expected_answer_keywords": ["headache", "gastrointestinal", "dizziness"],
|
| 119 |
+
"expected_source_docs": ["medical_research_1"],
|
| 120 |
+
"description": "Should retrieve safety profile section"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"query": "How does Drug X interact with warfarin?",
|
| 124 |
+
"expected_answer_keywords": ["warfarin", "bleeding", "INR"],
|
| 125 |
+
"expected_source_docs": ["drug_interactions"],
|
| 126 |
+
"description": "Should retrieve drug interactions guide"
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"query": "What is the recommended dosage of Drug X?",
|
| 130 |
+
"expected_answer_keywords": ["500mg", "twice daily"],
|
| 131 |
+
"expected_source_docs": ["medical_research_1"],
|
| 132 |
+
"description": "Should retrieve dosage recommendations"
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"query": "What is the mechanism of action for Drug X?",
|
| 136 |
+
"expected_answer_keywords": ["protein kinase", "inhibiting", "apoptosis"],
|
| 137 |
+
"expected_source_docs": ["medical_research_1"],
|
| 138 |
+
"description": "Should retrieve mechanism section"
|
| 139 |
+
},
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# ==================== METRIC COMPUTATION ====================
|
| 144 |
+
|
| 145 |
+
def compute_retrieval_precision(
    retrieved_docs: List[str],
    expected_docs: List[str]
) -> float:
    """
    Fraction of the retrieved documents that are in the expected set.

    Precision = TP / (TP + FP), where TP counts retrieved docs found in
    ``expected_docs`` and TP + FP is the number of retrieved docs. Each
    retrieved entry is counted, so repeated entries count repeatedly.

    Returns 0.0 when nothing was retrieved.
    """
    if retrieved_docs:
        hits = [name for name in retrieved_docs if name in expected_docs]
        return float(len(hits) / len(retrieved_docs))

    return 0.0
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def compute_retrieval_recall(
    retrieved_docs: List[str],
    expected_docs: List[str]
) -> float:
    """
    Fraction of the expected documents that were actually retrieved.

    Recall = TP / (TP + FN), where TP counts expected docs present in
    ``retrieved_docs`` and TP + FN is the number of expected docs.

    Returns 1.0 when no documents are expected (nothing could be missed).
    """
    if expected_docs:
        found = 0
        for name in expected_docs:
            if name in retrieved_docs:
                found += 1
        return float(found / len(expected_docs))

    return 1.0  # Nothing expected, so recall is trivially perfect
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def compute_bert_score(generated_answer: str, expected_keywords: List[str]) -> float:
    """
    Keyword-coverage stand-in for a semantic similarity score.

    Despite the name, no embeddings are involved: the score is the share of
    ``expected_keywords`` that occur as case-insensitive substrings of the
    answer, capped at 0.95.

    Returns 1.0 when there are no expected keywords.
    """
    if not expected_keywords:
        return 1.0

    haystack = generated_answer.lower()

    # Case-insensitive substring test per keyword; True counts as 1.
    matched = sum(keyword.lower() in haystack for keyword in expected_keywords)
    coverage = matched / len(expected_keywords)

    # Never report a perfect score from keyword matching alone.
    return float(min(coverage, 0.95))
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def compute_answer_relevance(generated_answer: str, query: str) -> float:
    """
    Heuristic relevance score for an answer.

    The query text is not inspected. The score depends only on the answer:
      - 0.3  if it contains a known refusal phrase (case-insensitive)
      - 0.4  if it has fewer than three whitespace-separated words
      - 0.85 otherwise
    """
    lowered = generated_answer.lower()

    # Phrases that mark the answer as a refusal / non-answer
    refusal_markers = (
        "i don't know",
        "i cannot",
        "i'm unable",
        "not found",
        "no information",
        "unable to find",
    )

    for marker in refusal_markers:
        if marker in lowered:
            return 0.3

    # Too short to be a meaningful answer
    word_count = len(generated_answer.split())
    return 0.4 if word_count < 3 else 0.85
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def detect_hallucinations(
    generated_answer: str,
    retrieved_context: str
) -> bool:
    """
    Flag an answer whose vocabulary barely overlaps the retrieved context.

    This is a loose lexical approximation, not an NLI-based check: both texts
    are lower-cased and split on whitespace, and the answer is flagged when
    fewer than 30% of its distinct words also occur in the context. An empty
    answer has zero overlap and is therefore flagged.
    """
    answer_vocab = set(generated_answer.lower().split())
    context_vocab = set(retrieved_context.lower().split())

    shared = answer_vocab.intersection(context_vocab)

    # Use 1 as the denominator for an empty answer to avoid dividing by zero.
    denominator = len(answer_vocab) or 1

    return len(shared) / denominator < 0.3
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def compute_faithfulness(
    generated_answer: str,
    retrieved_context: str
) -> float:
    """
    Estimate how well the answer is grounded in the retrieved context.

    Approximation: fraction of the answer's distinct words that also occur in
    the context, doubled and capped at 1.0 (so 0.3 overlap maps to 0.6).

    Args:
        generated_answer: Text produced by the RAG pipeline.
        retrieved_context: Concatenated text of the retrieved chunks.

    Returns:
        A score in [0.0, 1.0]; 0.0 when the answer contains no words.
    """
    import string  # local import keeps this function self-contained

    def _distinct_words(text):
        # Strip surrounding punctuation so "outcomes." matches "outcomes".
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return {token for token in stripped if token}

    answer_words = _distinct_words(generated_answer)
    if not answer_words:
        return 0.0

    context_words = _distinct_words(retrieved_context)

    # Overlap ratio
    overlap = len(answer_words & context_words) / len(answer_words)

    # Convert to 0-1 scale (0.3 overlap = 0.6 faithfulness)
    return float(min(overlap * 2, 1.0))
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def compute_source_attribution(
    generated_answer: str,
    retrieved_context: str
) -> float:
    """
    Estimate what fraction of the answer is backed by the retrieved context.

    Approximation: share of the answer's words (repeats counted) that also
    appear somewhere in the context.

    Args:
        generated_answer: Text produced by the RAG pipeline.
        retrieved_context: Concatenated text of the retrieved chunks.

    Returns:
        A ratio in [0.0, 1.0]; 0.0 when the answer contains no words.
    """
    import string  # local import keeps this function self-contained

    def _words(text):
        # Strip surrounding punctuation so "outcomes." matches "outcomes".
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return [token for token in stripped if token]

    answer_words = _words(generated_answer)
    if not answer_words:
        return 0.0

    context_words = set(_words(retrieved_context))

    attributed_words = sum(1 for word in answer_words if word in context_words)
    return float(attributed_words / len(answer_words))
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
# ==================== MAIN EVALUATION LOOP ====================
|
| 314 |
+
|
| 315 |
+
def run_real_evaluation():
    """
    Run the evaluation test cases against the RAG pipeline and print a report.

    Steps:
        1. Write every ``TEST_DOCUMENTS`` entry into a temporary directory.
        2. Build a ``RAGPipeline`` and ingest each document under its file stem.
        3. For every ``TEST_CASES`` entry: query the pipeline, compute the
           retrieval and generation metrics, and record an ``EvaluationResult``.
        4. Print the aggregate metrics reported by the evaluator.

    Returns:
        None. Pipeline-initialisation or ingestion failures abort the run
        early; a failure inside a single test case is printed and skipped.
    """
    print("=" * 70)
    print("REAL RAG EVALUATION")
    print("=" * 70)

    # Initialize evaluator
    evaluator = RAGEvaluator(store_results=True, results_dir="evaluation_results")

    # Create temporary directory for test documents
    with tempfile.TemporaryDirectory() as tmpdir:
        print(f"\nCreating test documents in {tmpdir}")

        # Write test documents. Ingestion below feeds the in-memory text, so
        # these files are never read back by this function.
        for doc_name, content in TEST_DOCUMENTS.items():
            (Path(tmpdir) / doc_name).write_text(content, encoding="utf-8")
            print(f" β {doc_name}")

        # Initialize RAG pipeline
        print(f"\nπ Initializing RAG pipeline...")
        try:
            pipeline = RAGPipeline()
        except Exception as e:
            print(f" β Failed to initialize pipeline: {e}")
            return

        # Ingest documents
        print(f"\nIngesting documents into RAG...")
        try:
            for doc_name, content in TEST_DOCUMENTS.items():
                doc_id = Path(doc_name).stem  # Remove extension
                pipeline.ingest(doc_id, content)
                print(f" β Ingested {doc_name} (doc_id: {doc_id})")
        except Exception as e:
            print(f" β Failed to ingest documents: {e}")
            return

        # Run test cases
        print(f"\nRunning {len(TEST_CASES)} test cases...\n")

        for i, test_case in enumerate(TEST_CASES, 1):
            query = test_case["query"]
            expected_keywords = test_case["expected_answer_keywords"]
            expected_docs = test_case["expected_source_docs"]

            print(f"Test {i}: {query}")
            print(f" Expected sources: {expected_docs}")

            try:
                # perf_counter is monotonic, so the measured latency cannot
                # be distorted by wall-clock adjustments.
                start_time = time.perf_counter()

                # Query the RAG
                result = pipeline.query(query, return_sources=True)

                latency_ms = (time.perf_counter() - start_time) * 1000

                answer = result.get('answer', '')
                retrieved_docs = result.get('sources', [])
                context = result.get('context', '')

                # If context is empty, reconstruct from retrieved sources
                if not context and retrieved_docs:
                    # Combine previews from all retrieved sources
                    context = ' '.join(
                        source.get('preview', '')
                        for source in retrieved_docs
                        if isinstance(source, dict)
                    )

                print(f" DEBUG - context length: {len(context)}")

                retrieved_doc_names = _doc_ids_from_sources(retrieved_docs)

                print(f" Retrieved: {retrieved_doc_names}")
                print(f" Answer: {answer[:100]}...")

                # COMPUTE REAL METRICS
                retrieval_precision = compute_retrieval_precision(
                    retrieved_doc_names,
                    expected_docs
                )
                retrieval_recall = compute_retrieval_recall(
                    retrieved_doc_names,
                    expected_docs
                )
                bert_score = compute_bert_score(answer, expected_keywords)
                answer_relevance = compute_answer_relevance(answer, query)
                faithfulness = compute_faithfulness(answer, context)
                hallucination_detected = detect_hallucinations(answer, context)
                source_attribution = compute_source_attribution(answer, context)

                print(f" Precision: {retrieval_precision:.2f} | Recall: {retrieval_recall:.2f}")
                print(f" BERTScore: {bert_score:.2f} | Relevance: {answer_relevance:.2f}")
                print(f" Faithfulness: {faithfulness:.2f} | Attribution: {source_attribution:.2f}")
                print(f" Hallucination: {hallucination_detected} | Latency: {latency_ms:.0f}ms")

                # Create evaluation result
                eval_result = EvaluationResult(
                    query=query,
                    answer=answer,
                    source_docs=retrieved_doc_names,
                    num_retrieved=len(retrieved_docs),
                    retrieval_precision=retrieval_precision,
                    retrieval_recall=retrieval_recall,
                    rank_position=_first_relevant_rank(retrieved_doc_names, expected_docs),
                    rouge_l=bert_score,  # Approximation
                    bert_score=bert_score,
                    answer_relevance=answer_relevance,
                    faithfulness=faithfulness,
                    hallucination_detected=hallucination_detected,
                    source_attribution_score=source_attribution,
                    latency_ms=latency_ms,
                    tokens_used=len(answer.split()),  # word count as a rough proxy
                    cost_cents=0.004,  # Estimate for Groq
                )

                evaluator.add_result(eval_result)

                print()

            except Exception as e:
                # Best-effort: one failing test case must not abort the run.
                print(f" β Test failed: {e}\n")

    # Print summary
    print("\n" + "=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)

    metrics = evaluator.compute_aggregate_metrics()

    print(f"\nResults:")
    print(f" Total Evaluations: {metrics['total_evaluations']}")
    print(f" Avg Precision: {metrics['retrieval_precision_mean']:.3f}")
    print(f" Avg Recall: {metrics['retrieval_recall_mean']:.3f}")
    print(f" Avg BERTScore: {metrics['bert_score_mean']:.3f}")
    print(f" Avg Faithfulness: {metrics['faithfulness_mean']:.3f}")
    print(f" Hallucination Rate: {metrics['hallucination_rate']*100:.1f}%")
    print(f" Avg Latency: {metrics['latency_mean']:.0f}ms")
    print(f" MRR: {metrics['mrr']:.3f}")

    print(f"\nResults saved to: evaluation_results/results.jsonl")
    print(f"View dashboard at: http://localhost:8000/evaluation")


def _doc_ids_from_sources(sources):
    """Map retrieved source dicts to doc ids (``{doc_id}_chunk_{n}`` -> ``doc_id``)."""
    doc_ids = []
    for source in sources:
        if not isinstance(source, dict):
            continue
        chunk_id = source.get('chunk_id', '')
        # Chunk ids that do not follow the expected format are reported as 'unknown'.
        doc_ids.append(chunk_id.rsplit('_chunk_', 1)[0] if '_chunk_' in chunk_id else 'unknown')
    return doc_ids


def _first_relevant_rank(retrieved_doc_names, expected_docs):
    """Return the 1-based position of the first expected doc in the retrieved list.

    When no expected document was retrieved, the rank is one past the last
    retrieved position (and never 1), so a miss is always worse than a hit.
    """
    expected = set(expected_docs)
    for position, name in enumerate(retrieved_doc_names, 1):
        if name in expected:
            return position
    return max(len(retrieved_doc_names), 1) + 1


if __name__ == "__main__":
    run_real_evaluation()
|
src/evaluation/ragas_integration/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAGAS Integration for Document Intelligence RAG"""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
from .pipeline_adapter import RagasReadyPipeline, EvaluationReadyResponse
|
| 5 |
+
from .ragas_evaluator import RagasEvaluator, RagasEvaluationResult
|
| 6 |
+
from .ragas_endpoints import init_ragas_router
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"RagasReadyPipeline",
|
| 10 |
+
"EvaluationReadyResponse",
|
| 11 |
+
"RagasEvaluator",
|
| 12 |
+
"RagasEvaluationResult",
|
| 13 |
+
"init_ragas_router",
|
| 14 |
+
]
|
src/evaluation/ragas_integration/pipeline_adapter.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pipeline Adapter - Captures full context for RAGAS evaluation
|
| 3 |
+
"""
|
| 4 |
+
import time
|
| 5 |
+
from typing import Dict, Any, List, Optional
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from src.rag.llm import build_context_string
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class EvaluationReadyResponse:
    """Pipeline answer bundled with the full retrieved texts that RAGAS needs."""

    # The question that was asked and the generated answer.
    query: str
    answer: str

    # Full text of each retrieved chunk (not truncated previews).
    contexts: List[str]

    # Original source metadata, one dict per retrieved chunk.
    sources: List[Dict]

    # Number of chunks that fed the answer.
    chunks_used: int

    # Time taken to produce the response, in milliseconds.
    response_time_ms: float

    # Outcome label, e.g. "success", "no_documents" or "no_results".
    status: str
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class RagasReadyPipeline:
    """
    Wraps an existing RAGPipeline so queries return the full retrieved texts.

    The wrapper reuses the base pipeline's config, embeddings client, LLM and
    vector store; it does not create any of its own.
    """

    def __init__(self, base_pipeline):
        """
        Args:
            base_pipeline: Existing RAGPipeline instance exposing ``config``,
                ``embeddings``, ``llm`` and ``vector_store`` attributes.
        """
        self.pipeline = base_pipeline
        self.config = base_pipeline.config
        self.embeddings = base_pipeline.embeddings
        self.llm = base_pipeline.llm
        self.vector_store = base_pipeline.vector_store

    @staticmethod
    def _empty_response(query, answer, status, response_time_ms):
        """Build a response for the paths where no context was retrieved."""
        return EvaluationReadyResponse(
            query=query,
            answer=answer,
            contexts=[],
            sources=[],
            chunks_used=0,
            response_time_ms=response_time_ms,
            status=status
        )

    def query_for_evaluation(
        self,
        query: str,
        top_k: Optional[int] = None
    ) -> EvaluationReadyResponse:
        """
        Query the pipeline and capture the FULL context for RAGAS.

        The query is embedded, the top chunks are retrieved, a context string
        is built for the LLM, and the answer is returned together with the
        untruncated text of every retrieved chunk.

        Args:
            query: The user question.
            top_k: Number of chunks to retrieve; falls back to
                ``config.top_k`` when omitted (or 0).

        Returns:
            EvaluationReadyResponse with ``status`` set to "success",
            "no_documents" (empty vector store) or "no_results" (nothing
            retrieved).
        """
        # Monotonic clock: elapsed time is immune to wall-clock adjustments.
        started = time.perf_counter()

        def elapsed_ms():
            return (time.perf_counter() - started) * 1000

        if self.vector_store.size() == 0:
            # Report measured time here as well, consistent with other paths.
            return self._empty_response(
                query, "No documents in vector store", "no_documents", elapsed_ms()
            )

        query_embedding = self.embeddings.embed(query)
        k = top_k or self.config.top_k
        retrieved_chunks = self.vector_store.retrieve(query_embedding, top_k=k)
        if not retrieved_chunks:
            return self._empty_response(
                query, "No relevant documents found.", "no_results", elapsed_ms()
            )

        # RAGAS needs the complete chunk texts, not the 100-char previews.
        contexts = [chunk.text for chunk in retrieved_chunks]

        context_string = build_context_string(retrieved_chunks)
        answer = self.llm.query(context=context_string, query=query)
        sources = [
            {
                "chunk_id": chunk.chunk_id,
                "similarity": round(chunk.similarity, 3),
                "preview": (chunk.text[:100] + "...") if len(chunk.text) > 100 else chunk.text
            }
            for chunk in retrieved_chunks
        ]

        return EvaluationReadyResponse(
            query=query,
            answer=answer,
            contexts=contexts,  # Full texts for RAGAS!
            sources=sources,
            chunks_used=len(contexts),
            response_time_ms=elapsed_ms(),
            status="success"
        )
|
src/evaluation/ragas_integration/ragas_endpoints.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAGAS API Endpoints
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import APIRouter, HTTPException
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from typing import List, Optional
|
| 7 |
+
|
| 8 |
+
router = APIRouter()
|
| 9 |
+
|
| 10 |
+
# We'll set these from main.py
|
| 11 |
+
ragas_pipeline = None
|
| 12 |
+
ragas_evaluator = None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class RagasEvalRequest(BaseModel):
    """Request body for evaluating an already-generated answer."""

    # The question that was asked.
    query: str
    # The answer to be scored.
    answer: str
    # Texts of the retrieved chunks the answer was based on.
    contexts: List[str]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class RagasQueryRequest(BaseModel):
    """Request body for running a query and evaluating its response."""

    # The question to send to the RAG pipeline.
    query: str
    # Number of chunks to retrieve; no bounds are enforced on this field.
    top_k: int = 3
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def init_ragas_router(pipeline, evaluator):
    """
    Bind the pipeline and evaluator used by the endpoints in this module.

    Both objects are stored in module-level globals that the route handlers
    read; the shared module-level ``router`` is returned.
    """
    global ragas_pipeline, ragas_evaluator
    ragas_pipeline, ragas_evaluator = pipeline, evaluator
    return router
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@router.post("/evaluate")
async def evaluate_direct(request: RagasEvalRequest):
    """
    Evaluate a query-answer pair directly.

    Use this when you already have the answer and contexts.

    Returns:
        A dict with ``eval_id``, ``faithfulness``, ``context_precision``,
        ``ragas_score`` and ``latency_ms``. A metric that could not be
        computed (non-finite score) is reported as ``null``.

    Raises:
        HTTPException: 503 when the evaluator has not been initialised.
    """
    import math

    if not ragas_evaluator:
        raise HTTPException(status_code=503, detail="RAGAS not initialized")

    result = await ragas_evaluator.evaluate_single(
        query=request.query,
        answer=request.answer,
        contexts=request.contexts
    )

    def _finite_or_none(score):
        # NaN/inf cannot be encoded as strict JSON, so a failed metric is
        # returned as null instead of breaking response serialisation.
        return score if math.isfinite(score) else None

    return {
        "eval_id": result.eval_id,
        "faithfulness": _finite_or_none(result.faithfulness),
        "context_precision": _finite_or_none(result.context_precision),
        "ragas_score": result.ragas_score,
        "latency_ms": round(result.latency_ms, 2)
    }
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@router.post("/query-and-evaluate")
async def query_and_evaluate(request: RagasQueryRequest):
    """
    Query the RAG system AND evaluate the response.

    Returns:
        On success, the answer with its sources plus a ``ragas`` dict of
        metrics; a metric that could not be computed (non-finite score) is
        reported as ``null``. When the pipeline reports a non-success status,
        only the query, answer and status are returned and ``ragas`` is
        ``None``.

    Raises:
        HTTPException: 503 when the pipeline or evaluator is not initialised.
    """
    import math

    if not ragas_pipeline or not ragas_evaluator:
        raise HTTPException(status_code=503, detail="RAGAS not initialized")

    # Step 1: Query pipeline
    # NOTE: this is a synchronous call, so it runs on the event-loop thread.
    response = ragas_pipeline.query_for_evaluation(
        query=request.query,
        top_k=request.top_k
    )

    if response.status != "success":
        # Nothing was retrieved, so there is nothing to evaluate.
        return {
            "query": response.query,
            "answer": response.answer,
            "status": response.status,
            "ragas": None
        }

    # Step 2: Evaluate with RAGAS
    eval_result = await ragas_evaluator.evaluate_single(
        query=response.query,
        answer=response.answer,
        contexts=response.contexts
    )

    def _finite_or_none(score):
        # NaN/inf cannot be encoded as strict JSON, so a failed metric is
        # returned as null instead of breaking response serialisation.
        return score if math.isfinite(score) else None

    return {
        "query": response.query,
        "answer": response.answer,
        "sources": response.sources,
        "chunks_used": response.chunks_used,
        "response_time_ms": round(response.response_time_ms, 2),
        "ragas": {
            "eval_id": eval_result.eval_id,
            "faithfulness": _finite_or_none(eval_result.faithfulness),
            "context_precision": _finite_or_none(eval_result.context_precision),
            "ragas_score": eval_result.ragas_score,
            "eval_time_ms": round(eval_result.latency_ms, 2)
        }
    }
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@router.get("/metrics")
async def get_metrics():
    """
    Get aggregate RAGAS metrics from all evaluations.

    Returns:
        ``total_evaluations`` plus the mean faithfulness, context precision
        and composite score, each rounded to three decimals. Non-finite
        scores are left out of a mean; a mean with no finite inputs is
        reported as ``null``.

    Raises:
        HTTPException: 503 when the evaluator has not been initialised.
    """
    import math

    if not ragas_evaluator:
        raise HTTPException(status_code=503, detail="RAGAS not initialized")

    results = ragas_evaluator.results

    if not results:
        return {"total_evaluations": 0, "message": "No evaluations yet"}

    def _mean_of_finite(values):
        # A single NaN would otherwise turn the whole average into NaN,
        # which cannot be encoded as strict JSON.
        finite = [value for value in values if math.isfinite(value)]
        return round(sum(finite) / len(finite), 3) if finite else None

    return {
        "total_evaluations": len(results),
        "avg_faithfulness": _mean_of_finite(r.faithfulness for r in results),
        "avg_context_precision": _mean_of_finite(r.context_precision for r in results),
        "avg_ragas_score": _mean_of_finite(r.ragas_score for r in results)
    }
|
src/evaluation/ragas_integration/ragas_evaluator.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAGAS Evaluator - Core evaluation logic using RAGAS framework
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import logging
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
# RAGAS imports
|
| 11 |
+
from ragas.metrics import (
|
| 12 |
+
Faithfulness,
|
| 13 |
+
ResponseRelevancy,
|
| 14 |
+
LLMContextPrecisionWithoutReference,
|
| 15 |
+
)
|
| 16 |
+
from ragas.llms import LangchainLLMWrapper
|
| 17 |
+
from ragas.dataset_schema import SingleTurnSample
|
| 18 |
+
|
| 19 |
+
# LangChain for LLM wrapper (RAGAS requirement)
|
| 20 |
+
from langchain_groq import ChatGroq
|
| 21 |
+
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class RagasEvaluationResult:
    """Result from RAGAS evaluation."""
    eval_id: str
    query: str

    # RAGAS metrics (0-1 scale)
    faithfulness: float
    answer_relevancy: float
    context_precision: float

    # Composite score; always recomputed in __post_init__, so any value
    # passed to the constructor is overwritten.
    ragas_score: float = 0.0

    # Metadata
    latency_ms: float = 0.0
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def __post_init__(self):
        """Set ``ragas_score`` to the mean of faithfulness and context precision.

        Scores that are missing (None or NaN) are left out of the mean. A
        genuine 0.0 is kept: dropping it would let a completely unfaithful
        answer inherit the other metric's score. With no usable score the
        composite is 0.0.
        """
        import math

        scores = [self.faithfulness, self.context_precision]
        valid_scores = [s for s in scores if s is not None and not math.isnan(s)]
        self.ragas_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class RagasEvaluator:
    """
    Evaluates RAG responses using RAGAS metrics.

    Active metrics:
    - Faithfulness: Is the answer grounded in the context?
    - Context Precision: Are the retrieved chunks useful?

    Answer Relevancy is currently disabled; results carry a 0.0 placeholder
    for it. Every result is also appended to ``self.results``.
    """

    def __init__(self, groq_api_key: Optional[str] = None):
        """
        Initialize RAGAS evaluator.

        Args:
            groq_api_key: Your Groq API key (or uses GROQ_API_KEY env var)

        Raises:
            ValueError: If no API key is passed and GROQ_API_KEY is unset.
        """
        api_key = groq_api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY required")

        # temperature=0 is requested for the judge model.
        llm = ChatGroq(
            api_key=api_key,
            model_name="llama-3.3-70b-versatile",
            temperature=0
        )

        self.evaluator_llm = LangchainLLMWrapper(llm)

        self.faithfulness = Faithfulness(llm=self.evaluator_llm)
        # NOTE: ResponseRelevancy (answer relevancy) is intentionally not
        # instantiated here; see the class docstring.
        self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)

        # Storage for results
        self.results: List[RagasEvaluationResult] = []

        logger.info("β RAGAS Evaluator initialized (Faithfulness + Context Precision)")

    async def evaluate_single(
        self,
        query: str,
        answer: str,
        contexts: List[str],
        ground_truth: Optional[str] = None
    ) -> RagasEvaluationResult:
        """
        Evaluate a single RAG response.

        Args:
            query: The user question.
            answer: The generated answer to score.
            contexts: Texts of the retrieved chunks.
            ground_truth: Optional reference answer; an empty string is
                passed to RAGAS when omitted.

        Returns:
            The RagasEvaluationResult, which is also appended to
            ``self.results``.
        """
        import time
        import hashlib

        # Monotonic clock: elapsed time is immune to wall-clock adjustments.
        start_time = time.perf_counter()

        # 1. Create SingleTurnSample
        sample = SingleTurnSample(
            user_input=query,
            response=answer,
            retrieved_contexts=contexts,
            reference=ground_truth or ""
        )

        # 2. Score with each active metric (async!)
        faithfulness_score = await self.faithfulness.single_turn_ascore(sample)
        context_precision_score = await self.context_precision.single_turn_ascore(sample)

        # 3. Calculate latency
        latency_ms = (time.perf_counter() - start_time) * 1000

        # 4. Generate eval_id: short digest of query + current timestamp
        # (an identifier only, not a security measure).
        eval_id = hashlib.md5(f"{query}{datetime.now().isoformat()}".encode()).hexdigest()[:8]

        # 5. Create and store result
        result = RagasEvaluationResult(
            eval_id=eval_id,
            query=query,
            faithfulness=float(faithfulness_score),
            answer_relevancy=0.0,  # placeholder while the metric is disabled
            context_precision=float(context_precision_score),
            latency_ms=latency_ms
        )

        self.results.append(result)

        logger.info(f"Evaluation complete: RAGAS score = {result.ragas_score:.3f}")

        return result
|
src/evaluation/ragas_integration/test_ragas.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quick test for RAGAS integration
|
| 3 |
+
Run: python -m src.evaluation.ragas_integration.test_ragas
|
| 4 |
+
"""
|
| 5 |
+
import asyncio
|
| 6 |
+
from src.rag import RAGPipeline, RAGConfig
|
| 7 |
+
from src.evaluation.ragas_integration import RagasReadyPipeline, RagasEvaluator
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
async def test_ragas():
    """
    Smoke-test the RAGAS integration end to end and print the scores.

    Builds a pipeline and evaluator, ingests one short document, asks a
    question through the evaluation-ready wrapper, scores the response and
    returns the evaluation result.
    """
    banner = "=" * 50

    print(banner)
    print("Testing RAGAS Integration")
    print(banner)

    # Step 1: Initialize pipeline
    print("\n1. Initializing pipeline...")
    pipeline = RAGPipeline(RAGConfig())
    ragas_pipeline = RagasReadyPipeline(pipeline)
    print(" β Pipeline ready")

    # Step 2: Initialize evaluator
    print("\n2. Initializing RAGAS evaluator...")
    evaluator = RagasEvaluator()
    print(" β Evaluator ready")

    # Step 3: Ingest a test document
    print("\n3. Ingesting test document...")
    test_text = (
        "\n"
        "Machine learning is a subset of artificial intelligence that enables\n"
        "systems to learn from data. Deep learning is a type of machine learning\n"
        "that uses neural networks with multiple layers. Natural language processing\n"
        "(NLP) is used to understand human language.\n"
    )
    pipeline.ingest("test_doc", test_text)
    print(f" β Ingested {pipeline.vector_store.size()} chunks")

    # Step 4: Query with full context capture
    print("\n4. Querying pipeline...")
    response = ragas_pipeline.query_for_evaluation("What is machine learning?")
    for line in (
        f" Query: {response.query}",
        f" Answer: {response.answer[:100]}...",
        f" Contexts captured: {len(response.contexts)}",
        f" Status: {response.status}",
    ):
        print(line)

    # Step 5: Evaluate with RAGAS
    print("\n5. Running RAGAS evaluation...")
    result = await evaluator.evaluate_single(
        query=response.query,
        answer=response.answer,
        contexts=response.contexts,
    )

    print(f"\n{banner}")
    print("RAGAS RESULTS")
    print(f"{banner}")
    print(f" Faithfulness: {result.faithfulness:.3f}")
    print(f" Answer Relevancy: {result.answer_relevancy:.3f}")
    print(f" Context Precision: {result.context_precision:.3f}")
    print(f" βββββββββββββββββββββββββββββ")
    print(f" RAGAS Score: {result.ragas_score:.3f}")
    print(f" Eval Time: {result.latency_ms:.0f}ms")
    print(f"{banner}")

    return result


if __name__ == "__main__":
    asyncio.run(test_ragas())
|
src/main.py
CHANGED
|
@@ -10,6 +10,12 @@ from datetime import datetime
|
|
| 10 |
import tempfile
|
| 11 |
from pathlib import Path
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
from src.rag import RAGPipeline, RAGConfig
|
| 14 |
from src.evaluation import RAGEvaluator, EvaluationResult
|
| 15 |
import io
|
|
@@ -49,7 +55,8 @@ if os.path.exists("frontend"):
|
|
| 49 |
|
| 50 |
# Global pipeline instance
|
| 51 |
pipeline: Optional[RAGPipeline] = None
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
# ==================== Pydantic Models ====================
|
| 55 |
|
|
@@ -108,7 +115,7 @@ class StatsResponse(BaseModel):
|
|
| 108 |
@app.on_event("startup")
|
| 109 |
async def startup_event():
|
| 110 |
"""Initialize pipeline on startup."""
|
| 111 |
-
global pipeline
|
| 112 |
|
| 113 |
logger.info("=" * 60)
|
| 114 |
logger.info("Starting Document Intelligence RAG API")
|
|
@@ -124,8 +131,18 @@ async def startup_event():
|
|
| 124 |
|
| 125 |
# Initialize pipeline (automatically uses get_embeddings_client())
|
| 126 |
pipeline = RAGPipeline(config=config)
|
| 127 |
-
|
| 128 |
logger.info("β Pipeline initialized successfully")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
logger.info(f"β Embedding backend: {config.embedding_backend}")
|
| 130 |
logger.info(f"β API ready at http://localhost:8000")
|
| 131 |
logger.info(f"β Interactive docs at http://localhost:8000/docs")
|
|
@@ -429,7 +446,7 @@ async def reset_system():
|
|
| 429 |
Returns:
|
| 430 |
Reset confirmation
|
| 431 |
"""
|
| 432 |
-
global pipeline
|
| 433 |
|
| 434 |
if not pipeline:
|
| 435 |
raise HTTPException(status_code=503, detail="Pipeline not initialized")
|
|
@@ -439,12 +456,15 @@ async def reset_system():
|
|
| 439 |
|
| 440 |
# Clear vector store
|
| 441 |
pipeline.vector_store.clear()
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
logger.info("β System reset complete")
|
| 444 |
|
| 445 |
return {
|
| 446 |
"status": "success",
|
| 447 |
-
"message": "All documents and
|
| 448 |
"chunks_remaining": 0,
|
| 449 |
"timestamp": datetime.now().isoformat()
|
| 450 |
}
|
|
@@ -659,6 +679,16 @@ async def query_with_evaluation(request: dict):
|
|
| 659 |
return {"error": str(e)}, 500
|
| 660 |
|
| 661 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
# ==================== Root Endpoint ====================
|
| 663 |
|
| 664 |
@app.get("/", response_class=FileResponse)
|
|
|
|
| 10 |
import tempfile
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
+
from src.evaluation.ragas_integration import (
|
| 14 |
+
RagasReadyPipeline,
|
| 15 |
+
RagasEvaluator,
|
| 16 |
+
init_ragas_router,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
from src.rag import RAGPipeline, RAGConfig
|
| 20 |
from src.evaluation import RAGEvaluator, EvaluationResult
|
| 21 |
import io
|
|
|
|
| 55 |
|
| 56 |
# Global pipeline instance
|
| 57 |
pipeline: Optional[RAGPipeline] = None
|
| 58 |
+
ragas_pipeline = None
|
| 59 |
+
ragas_evaluator = None
|
| 60 |
|
| 61 |
# ==================== Pydantic Models ====================
|
| 62 |
|
|
|
|
| 115 |
@app.on_event("startup")
|
| 116 |
async def startup_event():
|
| 117 |
"""Initialize pipeline on startup."""
|
| 118 |
+
global pipeline, ragas_pipeline, ragas_evaluator
|
| 119 |
|
| 120 |
logger.info("=" * 60)
|
| 121 |
logger.info("Starting Document Intelligence RAG API")
|
|
|
|
| 131 |
|
| 132 |
# Initialize pipeline (automatically uses get_embeddings_client())
|
| 133 |
pipeline = RAGPipeline(config=config)
|
|
|
|
| 134 |
logger.info("β Pipeline initialized successfully")
|
| 135 |
+
|
| 136 |
+
# RAGAS integration
|
| 137 |
+
ragas_pipeline = RagasReadyPipeline(pipeline)
|
| 138 |
+
logger.info("β Ragas pipeline initialized successfully")
|
| 139 |
+
ragas_evaluator = RagasEvaluator()
|
| 140 |
+
logger.info("β Ragas evaluator initialized successfully")
|
| 141 |
+
ragas_router = init_ragas_router(ragas_pipeline, ragas_evaluator)
|
| 142 |
+
app.include_router(ragas_router, prefix="/ragas", tags=["RAGAS Evaluation"])
|
| 143 |
+
logger.info("β Ragas evaluator initialized successfully")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
logger.info(f"β Embedding backend: {config.embedding_backend}")
|
| 147 |
logger.info(f"β API ready at http://localhost:8000")
|
| 148 |
logger.info(f"β Interactive docs at http://localhost:8000/docs")
|
|
|
|
| 446 |
Returns:
|
| 447 |
Reset confirmation
|
| 448 |
"""
|
| 449 |
+
global pipeline, ragas_evaluator
|
| 450 |
|
| 451 |
if not pipeline:
|
| 452 |
raise HTTPException(status_code=503, detail="Pipeline not initialized")
|
|
|
|
| 456 |
|
| 457 |
# Clear vector store
|
| 458 |
pipeline.vector_store.clear()
|
| 459 |
+
if ragas_evaluator:
|
| 460 |
+
ragas_evaluator.results = []
|
| 461 |
+
logger.info("β RAGAS evaluations cleared")
|
| 462 |
|
| 463 |
logger.info("β System reset complete")
|
| 464 |
|
| 465 |
return {
|
| 466 |
"status": "success",
|
| 467 |
+
"message": "All documents, embeddings, and RAGAS evaluations cleared",
|
| 468 |
"chunks_remaining": 0,
|
| 469 |
"timestamp": datetime.now().isoformat()
|
| 470 |
}
|
|
|
|
| 679 |
return {"error": str(e)}, 500
|
| 680 |
|
| 681 |
|
| 682 |
+
# ===================== RAGAS Endpoints ====================
|
| 683 |
+
|
| 684 |
+
@app.get("/ragas-demo")
|
| 685 |
+
async def ragas_demo_page():
|
| 686 |
+
"""Serve RAGAS evaluation demo page."""
|
| 687 |
+
frontend_path = "frontend/ragas.html"
|
| 688 |
+
if os.path.exists(frontend_path):
|
| 689 |
+
return FileResponse(frontend_path)
|
| 690 |
+
return {"error": "RAGAS demo page not found"}
|
| 691 |
+
|
| 692 |
# ==================== Root Endpoint ====================
|
| 693 |
|
| 694 |
@app.get("/", response_class=FileResponse)
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|