"""RAGAS evaluation harness for the clinical-trial RAG chain.

Runs 18 grounded questions (3 per clinical-trial report) through the RAG
chain, collects answers and retrieved contexts, and scores them with the
RAGAS metrics: faithfulness, answer relevancy, context precision, and
context recall. Results are printed and saved to ``evaluation_results.csv``.
"""

import pandas as pd  # needed at runtime by results.to_pandas()
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from ragas.run_config import RunConfig

from src import config
from src.chain import get_rag_chain

# RAGAS v0.2+ requires LLM and Embeddings to be wrapped explicitly
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
    model=config.LLM_MODEL,
    temperature=0,
    openai_api_key=config.OPENAI_API_KEY,
))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(
    model=config.EMBEDDING_MODEL,
    openai_api_key=config.OPENAI_API_KEY,
))

# 18 questions grounded across all 6 clinical trial reports
test_questions = [
    # Report 1: Osimertinib / EGFR T790M (NCT01234567)
    {
        "question": "Does Osimertinib work for the C797S resistance mutation?",
        "ground_truth": "No, patients with the C797S resistance mutation showed no response to Osimertinib."
    },
    {
        "question": "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
        "ground_truth": "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy."
    },
    {
        "question": "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
        "ground_truth": "The clinical trial ID is NCT01234567 and the study date was 2023-05-12."
    },
    # Report 2: Sotorasib / KRAS G12C (NCT03600883)
    {
        "question": "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
        "ground_truth": "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC."
    },
    {
        "question": "How do co-occurring STK11 mutations affect Sotorasib response?",
        "ground_truth": "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients."
    },
    {
        "question": "What were the most common severe adverse events reported with Sotorasib?",
        "ground_truth": "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea."
    },
    # Report 3: Lorlatinib vs Alectinib / ALK (NCT04685369)
    {
        "question": "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
        "ground_truth": "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib."
    },
    {
        "question": "What is the median progression-free survival for Lorlatinib versus Alectinib?",
        "ground_truth": "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib."
    },
    {
        "question": "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
        "ground_truth": "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib."
    },
    # Report 4: Pembrolizumab / PD-L1 (NCT02142738)
    {
        "question": "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
        "ground_truth": "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC."
    },
    {
        "question": "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
        "ground_truth": "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy."
    },
    {
        "question": "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
        "ground_truth": "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy."
    },
    # Report 5: Olaparib / BRCA1 & BRCA2 Ovarian Cancer (NCT01874353)
    {
        "question": "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
        "ground_truth": "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group."
    },
    {
        "question": "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
        "ground_truth": "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates."
    },
    {
        "question": "What causes acquired resistance to Olaparib in ovarian cancer patients?",
        "ground_truth": "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge."
    },
    # Report 6: Trastuzumab Deruxtecan / HER2 Breast Cancer (NCT03529110)
    {
        "question": "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
        "ground_truth": "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients."
    },
    {
        "question": "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
        "ground_truth": "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category."
    },
    {
        "question": "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
        "ground_truth": "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events."
    },
]


def run_evaluation():
    """Run every test question through the RAG chain and score with RAGAS.

    Side effects: prints progress and scores to stdout, and writes the
    per-question metric breakdown to ``evaluation_results.csv``.
    """
    print("Starting RAGAS Evaluation...")
    chain = get_rag_chain()

    data = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
    for item in test_questions:
        print(f" Testing: {item['question']}")
        result = chain(item["question"])
        # RAGAS expects `contexts` to be a list of plain strings per sample,
        # so unwrap the chain's context dicts down to their text content.
        context_strings = [ctx["content"] for ctx in result["contexts"]]
        data["question"].append(item["question"])
        data["answer"].append(result["answer"])
        data["contexts"].append(context_strings)
        data["ground_truth"].append(item["ground_truth"])

    dataset = Dataset.from_dict(data)

    print("\nRunning RAGAS scoring (this may take a few minutes)...")
    results = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        run_config=RunConfig(timeout=120, max_workers=4),
    )

    print("\nEvaluation Scores:")
    print(results)

    df = results.to_pandas()
    df.to_csv("evaluation_results.csv", index=False)
    print("Results saved to evaluation_results.csv")


if __name__ == "__main__":
    # Close the Qdrant client even when evaluation raises, so the
    # connection is not leaked on failure (previously it was closed only
    # on the success path).
    try:
        run_evaluation()
    finally:
        config.qdrant_client.close()