from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from ragas.metrics import (
    faithfulness,       # is every claim in the answer grounded in the retrieved contexts?
    answer_relevancy,   # does the answer actually address the question asked?
    context_precision,  # are the retrieved chunks relevant and well-ranked?
    context_recall,     # do the retrieved chunks cover the ground truth?
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from src.chain import get_rag_chain
from src import config


# RAGAS needs its own judge LLM and embeddings; temperature=0 keeps the
# scores repeatable across runs.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
    model=config.LLM_MODEL,
    temperature=0,
    openai_api_key=config.OPENAI_API_KEY,
))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(
    model=config.EMBEDDING_MODEL,
    openai_api_key=config.OPENAI_API_KEY,
))
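
# Note: the judge reuses the models configured in src.config. If the chain
# under test runs on the same LLM, consider pointing the judge at a different
# model to reduce self-preference bias in the scores.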


# Evaluation set: question / ground-truth pairs, three per source study.
test_questions = [
    # Osimertinib in EGFR-mutated NSCLC
    {
        "question": "Does Osimertinib work for the C797S resistance mutation?",
        "ground_truth": "No, patients with the C797S resistance mutation showed no response to Osimertinib.",
    },
    {
        "question": "What was the progression-free survival outcome of Osimertinib versus chemotherapy in EGFR-mutated NSCLC?",
        "ground_truth": "Osimertinib showed a significant increase in progression-free survival compared to standard chemotherapy.",
    },
    {
        "question": "What is the clinical trial ID and date for the Osimertinib EGFR NSCLC study?",
        "ground_truth": "The clinical trial ID is NCT01234567 and the study date was 2023-05-12.",
    },

    # Sotorasib in KRAS G12C-mutated NSCLC
    {
        "question": "What is the objective response rate of Sotorasib in KRAS G12C-mutated NSCLC?",
        "ground_truth": "Sotorasib demonstrated an objective response rate of 37.1% in KRAS G12C-mutated NSCLC.",
    },
    {
        "question": "How do co-occurring STK11 mutations affect Sotorasib response?",
        "ground_truth": "Patients with co-occurring STK11 mutations showed significantly reduced response rates to Sotorasib compared to STK11 wild-type patients.",
    },
    {
        "question": "What were the most common severe adverse events reported with Sotorasib?",
        "ground_truth": "Grade 3 or higher adverse events occurred in 19% of patients, most commonly hepatotoxicity and diarrhea.",
    },

    # Lorlatinib vs. Alectinib in ALK-positive NSCLC
    {
        "question": "What was the intracranial response rate of Lorlatinib compared to Alectinib in ALK-positive NSCLC?",
        "ground_truth": "Lorlatinib demonstrated superior intracranial response rates of 82% compared to 58% for Alectinib.",
    },
    {
        "question": "What is the median progression-free survival for Lorlatinib versus Alectinib?",
        "ground_truth": "Median progression-free survival was 18.3 months for Lorlatinib versus 14.8 months for Alectinib.",
    },
    {
        "question": "Why is Lorlatinib preferred over Alectinib for ALK-positive NSCLC patients with brain metastases?",
        "ground_truth": "Lorlatinib is preferred due to its superior CNS penetration, achieving an 82% intracranial response rate compared to 58% for Alectinib.",
    },

    # Pembrolizumab and PD-L1 expression in NSCLC
    {
        "question": "What PD-L1 TPS threshold qualifies patients for first-line Pembrolizumab monotherapy in NSCLC?",
        "ground_truth": "A PD-L1 TPS of 50% or higher is the validated threshold for Pembrolizumab monotherapy in first-line NSCLC.",
    },
    {
        "question": "What is the 5-year overall survival rate for NSCLC patients with PD-L1 TPS >= 50% treated with Pembrolizumab?",
        "ground_truth": "Patients with TPS >= 50% achieved a 5-year overall survival rate of 31.9% with Pembrolizumab, compared to 16.3% for chemotherapy.",
    },
    {
        "question": "What treatment should patients with low or negative PD-L1 expression receive instead of Pembrolizumab monotherapy?",
        "ground_truth": "Patients with low or negative PD-L1 should receive combination chemoimmunotherapy.",
    },

    # Olaparib maintenance in BRCA-mutated ovarian cancer
    {
        "question": "What was the median progression-free survival for Olaparib versus placebo in BRCA-mutated ovarian cancer?",
        "ground_truth": "Olaparib significantly extended median progression-free survival to 19.1 months compared to 5.5 months in the placebo group.",
    },
    {
        "question": "Do BRCA1 and BRCA2 mutations predict equal benefit from Olaparib maintenance therapy?",
        "ground_truth": "No. BRCA2-mutated patients showed greater benefit with a 7-year overall survival of 67%, while BRCA1-mutated patients showed somewhat less benefit, likely due to differences in reversion mutation rates.",
    },
    {
        "question": "What causes acquired resistance to Olaparib in ovarian cancer patients?",
        "ground_truth": "Patients who developed BRCA reversion mutations showed acquired resistance to Olaparib and had significantly shorter progression-free survival on rechallenge.",
    },

    # T-DXd (trastuzumab deruxtecan) in HER2-positive and HER2-low breast cancer
    {
        "question": "How does T-DXd compare to T-DM1 in HER2-positive metastatic breast cancer for progression-free survival?",
        "ground_truth": "T-DXd achieved a median progression-free survival of 28.8 months compared to 6.8 months for T-DM1 in HER2-positive patients.",
    },
    {
        "question": "What is the clinical significance of the HER2-low category based on the T-DXd trial?",
        "ground_truth": "T-DXd demonstrated clinical benefit in HER2-low patients with a median PFS of 9.9 months versus 5.1 months for chemotherapy, establishing HER2-low as a new actionable treatment category.",
    },
    {
        "question": "What is the most clinically significant adverse event associated with T-DXd, and how common is it?",
        "ground_truth": "Interstitial lung disease was the most clinically significant adverse event, occurring in 12.1% of patients, with 0.8% experiencing Grade 5 (fatal) events.",
    },
]
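
# The 18 questions above mix single-fact lookups (trial IDs, response rates)
# with comparative and reasoning questions that require synthesizing a study.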


def run_evaluation():
    """Run the RAG chain over every test question and score it with RAGAS."""
    print("Starting RAGAS Evaluation...")
    chain = get_rag_chain()

    data = {"question": [], "answer": [], "contexts": [], "ground_truth": []}

    # Run the chain once per test question, collecting its answer and the
    # contexts it retrieved.
    for item in test_questions:
        print(f" Testing: {item['question']}")
        result = chain(item["question"])

        # RAGAS expects each row's contexts as a list of plain strings, not
        # the dicts returned by the chain.
        context_strings = [ctx["content"] for ctx in result["contexts"]]

        data["question"].append(item["question"])
        data["answer"].append(result["answer"])
        data["contexts"].append(context_strings)
        data["ground_truth"].append(item["ground_truth"])

    dataset = Dataset.from_dict(data)
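
    # The four metrics below expect exactly the column names used above
    # ("question", "answer", "contexts", "ground_truth"); ragas validates
    # them before scoring, so renaming any column fails fast.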

    print("\nRunning RAGAS scoring (this may take a few minutes)...")
    results = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        # Cap each metric call at 120 s and limit concurrency, which helps
        # stay under OpenAI rate limits during scoring.
        run_config=RunConfig(timeout=120, max_workers=4),
    )

    # Aggregate scores, one value per metric.
    print("\nEvaluation Scores:")
    print(results)

    # Per-question breakdown: one row per test question with all four scores.
    df = results.to_pandas()
    df.to_csv("evaluation_results.csv", index=False)
    print("Results saved to evaluation_results.csv")


if __name__ == "__main__":
    try:
        run_evaluation()
    finally:
        # Close the Qdrant connection even if evaluation fails partway through.
        config.qdrant_client.close()