File size: 7,384 Bytes
ea71a81 e8ade4e ea71a81 e8ade4e ea71a81 4df7450 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from src.chain import get_rag_chain
from src import config
# RAGAS v0.2+ requires the LLM and the embeddings to be wrapped explicitly,
# so each LangChain object is built first and then handed to its wrapper.

# Chat model used for scoring; model name and API key come from src.config.
_judge_chat_model = ChatOpenAI(
    model=config.LLM_MODEL,
    temperature=0,
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

# Embedding model passed to evaluate() alongside the LLM.
_judge_embedding_model = OpenAIEmbeddings(
    model=config.EMBEDDING_MODEL,
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_embeddings = LangchainEmbeddingsWrapper(_judge_embedding_model)
# 18 questions grounded across all 6 clinical trial reports (3 per report).
# Each entry is a (question, ground_truth) pair; the pairs are expanded into
# the dict form used by the evaluation loop right after the table.
_QA_PAIRS = (
    # Report 1: Osimertinib / EGFR T790M (NCT01234567)
    (
        "Does Osimertinib work for the C797S resistance mutation?",
        "No, patients with the C797S resistance mutation showed no response to Osimertinib.",
    ),
    (
        "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
        "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy.",
    ),
    (
        "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
        "The clinical trial ID is NCT01234567 and the study date was 2023-05-12.",
    ),
    # Report 2: Sotorasib / KRAS G12C (NCT03600883)
    (
        "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
        "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC.",
    ),
    (
        "How do co-occurring STK11 mutations affect Sotorasib response?",
        "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients.",
    ),
    (
        "What were the most common severe adverse events reported with Sotorasib?",
        "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea.",
    ),
    # Report 3: Lorlatinib vs Alectinib / ALK (NCT04685369)
    (
        "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
        "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib.",
    ),
    (
        "What is the median progression-free survival for Lorlatinib versus Alectinib?",
        "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib.",
    ),
    (
        "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
        "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib.",
    ),
    # Report 4: Pembrolizumab / PD-L1 (NCT02142738)
    (
        "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
        "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC.",
    ),
    (
        "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
        "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy.",
    ),
    (
        "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
        "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy.",
    ),
    # Report 5: Olaparib / BRCA1 & BRCA2 Ovarian Cancer (NCT01874353)
    (
        "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
        "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group.",
    ),
    (
        "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
        "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates.",
    ),
    (
        "What causes acquired resistance to Olaparib in ovarian cancer patients?",
        "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge.",
    ),
    # Report 6: Trastuzumab Deruxtecan / HER2 Breast Cancer (NCT03529110)
    (
        "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
        "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients.",
    ),
    (
        "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
        "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category.",
    ),
    (
        "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
        "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events.",
    ),
)

# List of {"question": ..., "ground_truth": ...} dicts, in table order.
test_questions = [
    {"question": question_text, "ground_truth": reference_answer}
    for question_text, reference_answer in _QA_PAIRS
]
def run_evaluation(questions=None, output_path: str = "evaluation_results.csv") -> None:
    """Run the RAG chain over a question set and score the output with RAGAS.

    Every question is passed to the callable returned by ``get_rag_chain()``.
    The returned ``"answer"`` and the ``"content"`` of each entry in
    ``"contexts"`` are collected together with the reference answer, scored
    with faithfulness, answer relevancy, context precision and context
    recall, printed, and written to a CSV file.

    Args:
        questions: Iterable of dicts carrying ``"question"`` and
            ``"ground_truth"`` keys. When omitted, the module-level
            ``test_questions`` list is used.
        output_path: Destination of the per-question score table. Defaults
            to ``"evaluation_results.csv"`` in the current working directory.
    """
    if questions is None:
        questions = test_questions

    print("Starting RAGAS Evaluation...")
    chain = get_rag_chain()

    # One list per dataset column; the lists grow in lockstep, one entry
    # per evaluated question.
    data = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
    for item in questions:
        print(f" Testing: {item['question']}")
        result = chain(item["question"])
        # The chain returns context records; only their text is scored.
        context_strings = [ctx["content"] for ctx in result["contexts"]]
        data["question"].append(item["question"])
        data["answer"].append(result["answer"])
        data["contexts"].append(context_strings)
        data["ground_truth"].append(item["ground_truth"])

    dataset = Dataset.from_dict(data)

    print("\nRunning RAGAS scoring (this may take a few minutes)...")
    results = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        run_config=RunConfig(timeout=120, max_workers=4),
    )

    print("\nEvaluation Scores:")
    print(results)

    df = results.to_pandas()
    df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")
if __name__ == "__main__":
    try:
        run_evaluation()
    finally:
        # Close the Qdrant client even if the evaluation raises, so a failed
        # run does not leave the client open.
        config.qdrant_client.close()
|