Spaces:

prabhal
/

geneseek

Sleeping

File size: 7,384 Bytes

import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from src.chain import get_rag_chain
from src import config

# RAGAS v0.2+ does not accept raw LangChain objects: the judge LLM and the
# embedding model must each be wrapped before being handed to `evaluate`.
# Both models are configured from `src.config`.
_judge_chat_model = ChatOpenAI(
    model=config.LLM_MODEL,
    temperature=0,
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

_judge_embedding_model = OpenAIEmbeddings(
    model=config.EMBEDDING_MODEL,
    openai_api_key=config.OPENAI_API_KEY,
)
evaluator_embeddings = LangchainEmbeddingsWrapper(_judge_embedding_model)

# Evaluation set: 18 (question, ground_truth) pairs, three for each of the
# 6 clinical trial reports. The pairs are expanded into dicts further down.
_QA_PAIRS = (
    # Report 1: Osimertinib / EGFR T790M (NCT01234567)
    (
        "Does Osimertinib work for the C797S resistance mutation?",
        "No, patients with the C797S resistance mutation showed no response to Osimertinib.",
    ),
    (
        "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
        "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy.",
    ),
    (
        "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
        "The clinical trial ID is NCT01234567 and the study date was 2023-05-12.",
    ),

    # Report 2: Sotorasib / KRAS G12C (NCT03600883)
    (
        "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
        "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC.",
    ),
    (
        "How do co-occurring STK11 mutations affect Sotorasib response?",
        "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients.",
    ),
    (
        "What were the most common severe adverse events reported with Sotorasib?",
        "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea.",
    ),

    # Report 3: Lorlatinib vs Alectinib / ALK (NCT04685369)
    (
        "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
        "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib.",
    ),
    (
        "What is the median progression-free survival for Lorlatinib versus Alectinib?",
        "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib.",
    ),
    (
        "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
        "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib.",
    ),

    # Report 4: Pembrolizumab / PD-L1 (NCT02142738)
    (
        "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
        "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC.",
    ),
    (
        "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
        "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy.",
    ),
    (
        "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
        "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy.",
    ),

    # Report 5: Olaparib / BRCA1 & BRCA2 Ovarian Cancer (NCT01874353)
    (
        "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
        "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group.",
    ),
    (
        "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
        "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates.",
    ),
    (
        "What causes acquired resistance to Olaparib in ovarian cancer patients?",
        "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge.",
    ),

    # Report 6: Trastuzumab Deruxtecan / HER2 Breast Cancer (NCT03529110)
    (
        "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
        "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients.",
    ),
    (
        "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
        "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category.",
    ),
    (
        "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
        "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events.",
    ),
)

# One dict per pair, with "question" first and "ground_truth" second.
test_questions = [
    {"question": question, "ground_truth": ground_truth}
    for question, ground_truth in _QA_PAIRS
]

def run_evaluation(output_path="evaluation_results.csv"):
    """Answer every entry in ``test_questions`` with the RAG chain and score it.

    For each test item the chain returned by ``get_rag_chain()`` is called
    with the question string. Its result is read as a mapping with an
    ``"answer"`` entry and a ``"contexts"`` entry, where each context exposes
    its text under ``"content"``. The collected rows are scored by RAGAS on
    faithfulness, answer relevancy, context precision and context recall,
    then printed and written to CSV.

    Args:
        output_path: Destination of the per-question scores CSV. Defaults to
            ``"evaluation_results.csv"`` in the current working directory.

    Returns:
        The result object produced by ``ragas.evaluate``, so callers can
        inspect the scores without re-reading the CSV.
    """
    print("Starting RAGAS Evaluation...")
    chain = get_rag_chain()

    # Column-oriented layout expected by ``Dataset.from_dict``.
    data = {"question": [], "answer": [], "contexts": [], "ground_truth": []}

    for item in test_questions:
        question = item["question"]
        print(f"  Testing: {question}")
        result = chain(question)

        # RAGAS is given plain strings, so keep only each context's text.
        context_strings = [ctx["content"] for ctx in result["contexts"]]

        data["question"].append(question)
        data["answer"].append(result["answer"])
        data["contexts"].append(context_strings)
        data["ground_truth"].append(item["ground_truth"])

    dataset = Dataset.from_dict(data)

    print("\nRunning RAGAS scoring (this may take a few minutes)...")
    results = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        run_config=RunConfig(timeout=120, max_workers=4),
    )

    print("\nEvaluation Scores:")
    print(results)

    df = results.to_pandas()
    df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

    return results

if __name__ == "__main__":
    # Always release config.qdrant_client, including when the evaluation
    # raises part-way through (failed chain call, scoring error, ...).
    try:
        run_evaluation()
    finally:
        config.qdrant_client.close()