Spaces:

prabhal
/

geneseek

Sleeping

App Files Files Community

geneseek / eval.py

prabhal

added updates geneseek

e8ade4e about 1 month ago

raw

history blame contribute delete

7.38 kB

	import pandas as pd
	from datasets import Dataset
	from ragas import evaluate
	from ragas.llms import LangchainLLMWrapper
	from ragas.embeddings import LangchainEmbeddingsWrapper
	from ragas.run_config import RunConfig
	from ragas.metrics import (
	faithfulness,
	answer_relevancy,
	context_precision,
	context_recall,
	)
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from src.chain import get_rag_chain
	from src import config

	# RAGAS v0.2+ does not take raw LangChain objects: the judge LLM and the
	# embedding model each have to be wrapped explicitly before being handed to
	# `evaluate()`. Model names and the API key are read from `src.config`.
	_judge_chat_model = ChatOpenAI(
	    model=config.LLM_MODEL,
	    temperature=0,  # presumably to keep judge scoring as repeatable as possible
	    openai_api_key=config.OPENAI_API_KEY,
	)
	evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

	_judge_embedding_model = OpenAIEmbeddings(
	    model=config.EMBEDDING_MODEL,
	    openai_api_key=config.OPENAI_API_KEY,
	)
	evaluator_embeddings = LangchainEmbeddingsWrapper(_judge_embedding_model)

	# Evaluation set: 18 question / reference-answer pairs, three for each of the
	# six clinical trial reports. Every entry is a dict with exactly two keys,
	# "question" and "ground_truth", in that order.
	test_questions = [
	    {"question": prompt, "ground_truth": reference}
	    for prompt, reference in (
	        # Report 1: Osimertinib / EGFR T790M (NCT01234567)
	        (
	            "Does Osimertinib work for the C797S resistance mutation?",
	            "No, patients with the C797S resistance mutation showed no response to Osimertinib.",
	        ),
	        (
	            "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
	            "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy.",
	        ),
	        (
	            "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
	            "The clinical trial ID is NCT01234567 and the study date was 2023-05-12.",
	        ),
	        # Report 2: Sotorasib / KRAS G12C (NCT03600883)
	        (
	            "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
	            "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC.",
	        ),
	        (
	            "How do co-occurring STK11 mutations affect Sotorasib response?",
	            "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients.",
	        ),
	        (
	            "What were the most common severe adverse events reported with Sotorasib?",
	            "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea.",
	        ),
	        # Report 3: Lorlatinib vs Alectinib / ALK (NCT04685369)
	        (
	            "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
	            "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib.",
	        ),
	        (
	            "What is the median progression-free survival for Lorlatinib versus Alectinib?",
	            "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib.",
	        ),
	        (
	            "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
	            "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib.",
	        ),
	        # Report 4: Pembrolizumab / PD-L1 (NCT02142738)
	        (
	            "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
	            "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC.",
	        ),
	        (
	            "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
	            "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy.",
	        ),
	        (
	            "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
	            "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy.",
	        ),
	        # Report 5: Olaparib / BRCA1 & BRCA2 Ovarian Cancer (NCT01874353)
	        (
	            "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
	            "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group.",
	        ),
	        (
	            "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
	            "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates.",
	        ),
	        (
	            "What causes acquired resistance to Olaparib in ovarian cancer patients?",
	            "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge.",
	        ),
	        # Report 6: Trastuzumab Deruxtecan / HER2 Breast Cancer (NCT03529110)
	        (
	            "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
	            "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients.",
	        ),
	        (
	            "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
	            "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category.",
	        ),
	        (
	            "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
	            "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events.",
	        ),
	    )
	]

	def run_evaluation():
	    """Run the RAG chain over ``test_questions`` and score it with RAGAS.

	    Each question is sent through the chain returned by ``get_rag_chain()``.
	    The chain's answer and the ``"content"`` field of every retrieved context
	    are collected next to the reference answer. The collected samples are then
	    scored on faithfulness, answer relevancy, context precision and context
	    recall using the module-level evaluator LLM and embeddings.

	    The aggregate scores are printed and the per-sample table is written to
	    ``evaluation_results.csv`` (relative to the current working directory).
	    Nothing is returned.
	    """
	    print("Starting RAGAS Evaluation...")
	    rag_chain = get_rag_chain()

	    # One list per dataset column; all four are appended to in lockstep.
	    questions, answers, contexts, references = [], [], [], []

	    for sample in test_questions:
	        query = sample["question"]
	        print(f" Testing: {query}")
	        response = rag_chain(query)

	        # Keep only the text of each retrieved chunk for scoring.
	        passages = [chunk["content"] for chunk in response["contexts"]]

	        questions.append(query)
	        answers.append(response["answer"])
	        contexts.append(passages)
	        references.append(sample["ground_truth"])

	    dataset = Dataset.from_dict(
	        {
	            "question": questions,
	            "answer": answers,
	            "contexts": contexts,
	            "ground_truth": references,
	        }
	    )

	    print("\nRunning RAGAS scoring (this may take a few minutes)...")
	    selected_metrics = [
	        faithfulness,
	        answer_relevancy,
	        context_precision,
	        context_recall,
	    ]
	    scoring_config = RunConfig(timeout=120, max_workers=4)
	    results = evaluate(
	        dataset=dataset,
	        metrics=selected_metrics,
	        llm=evaluator_llm,
	        embeddings=evaluator_embeddings,
	        run_config=scoring_config,
	    )

	    print("\nEvaluation Scores:")
	    print(results)

	    # Persist the per-sample scores without the DataFrame index column.
	    results.to_pandas().to_csv("evaluation_results.csv", index=False)
	    print("Results saved to evaluation_results.csv")

	# Script entry point: run the full evaluation, then release the Qdrant client.
	if __name__ == "__main__":
	    try:
	        run_evaluation()
	    finally:
	        # Close the client even when the evaluation raises, so a failed run
	        # does not leave it open.
	        config.qdrant_client.close()