geneseek / eval.py
prabhal's picture
added updates geneseek
e8ade4e
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from src.chain import get_rag_chain
from src import config
# RAGAS v0.2+ expects the evaluator LLM and embeddings as explicitly wrapped
# objects, so build the LangChain clients first and wrap them afterwards.
_judge_chat_model = ChatOpenAI(
    model=config.LLM_MODEL,
    temperature=0,  # presumably to keep judge output as repeatable as possible
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

_judge_embedding_model = OpenAIEmbeddings(
    model=config.EMBEDDING_MODEL,
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_embeddings = LangchainEmbeddingsWrapper(_judge_embedding_model)
# Evaluation set: 18 (question, ground truth) pairs, three for each of the
# 6 clinical trial reports.
_QA_PAIRS = (
    # Report 1: Osimertinib / EGFR T790M (NCT01234567)
    (
        "Does Osimertinib work for the C797S resistance mutation?",
        "No, patients with the C797S resistance mutation showed no response to Osimertinib.",
    ),
    (
        "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
        "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy.",
    ),
    (
        "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
        "The clinical trial ID is NCT01234567 and the study date was 2023-05-12.",
    ),
    # Report 2: Sotorasib / KRAS G12C (NCT03600883)
    (
        "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
        "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC.",
    ),
    (
        "How do co-occurring STK11 mutations affect Sotorasib response?",
        "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients.",
    ),
    (
        "What were the most common severe adverse events reported with Sotorasib?",
        "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea.",
    ),
    # Report 3: Lorlatinib vs Alectinib / ALK (NCT04685369)
    (
        "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
        "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib.",
    ),
    (
        "What is the median progression-free survival for Lorlatinib versus Alectinib?",
        "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib.",
    ),
    (
        "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
        "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib.",
    ),
    # Report 4: Pembrolizumab / PD-L1 (NCT02142738)
    (
        "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
        "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC.",
    ),
    (
        "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
        "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy.",
    ),
    (
        "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
        "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy.",
    ),
    # Report 5: Olaparib / BRCA1 & BRCA2 Ovarian Cancer (NCT01874353)
    (
        "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
        "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group.",
    ),
    (
        "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
        "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates.",
    ),
    (
        "What causes acquired resistance to Olaparib in ovarian cancer patients?",
        "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge.",
    ),
    # Report 6: Trastuzumab Deruxtecan / HER2 Breast Cancer (NCT03529110)
    (
        "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
        "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients.",
    ),
    (
        "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
        "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category.",
    ),
    (
        "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
        "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events.",
    ),
)

# Expand each pair into a dict keyed by "question" and "ground_truth".
test_questions = [
    {"question": question, "ground_truth": ground_truth}
    for question, ground_truth in _QA_PAIRS
]
def _collect_samples(chain, questions):
    """Run every question through ``chain`` and build the RAGAS input columns.

    Args:
        chain: Callable invoked with the question string. Its result is
            indexed with ``"answer"`` and ``"contexts"``, and every context
            entry with ``"content"``.
        questions: Iterable of dicts carrying ``"question"`` and
            ``"ground_truth"`` keys.

    Returns:
        A dict of four parallel lists keyed ``question``, ``answer``,
        ``contexts`` and ``ground_truth``.
    """
    samples = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
    for item in questions:
        print(f" Testing: {item['question']}")
        result = chain(item["question"])
        samples["question"].append(item["question"])
        samples["answer"].append(result["answer"])
        # Only the text of each retrieved context is passed on for scoring.
        samples["contexts"].append([ctx["content"] for ctx in result["contexts"]])
        samples["ground_truth"].append(item["ground_truth"])
    return samples


def run_evaluation(questions=None, output_path="evaluation_results.csv"):
    """Score the RAG chain with RAGAS and write the per-sample results to CSV.

    Args:
        questions: Optional iterable of dicts with ``"question"`` and
            ``"ground_truth"`` keys. Defaults to the module-level
            ``test_questions``.
        output_path: Where the CSV is written. Defaults to
            ``evaluation_results.csv``, resolved against the working directory.

    Returns:
        The DataFrame obtained from ``results.to_pandas()``, i.e. the same
        data that is written to ``output_path``.
    """
    if questions is None:
        questions = test_questions

    print("Starting RAGAS Evaluation...")
    chain = get_rag_chain()
    dataset = Dataset.from_dict(_collect_samples(chain, questions))

    print("\nRunning RAGAS scoring (this may take a few minutes)...")
    results = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        run_config=RunConfig(timeout=120, max_workers=4),
    )
    print("\nEvaluation Scores:")
    print(results)

    df = results.to_pandas()
    df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")
    return df
if __name__ == "__main__":
    # Close config.qdrant_client even when the evaluation raises, so a failed
    # run does not skip the cleanup.
    try:
        run_evaluation()
    finally:
        config.qdrant_client.close()