# File: ClinicianAssistant/scripts/ragas_eval.py
# Commit 35274a7 (JDFPalladium): adding idsr define tool and reflecting tweaks
# to other scripts and notebooks
# custom_rag_with_ragas.py
import numpy as np
import pandas as pd
from datasets import Dataset
from ragas.evaluation import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall
)
from llama_index.core import StorageContext, load_index_from_storage, QueryBundle
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import LLMRerank
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from llama_index.llms.openai import OpenAI
import os
from dotenv import load_dotenv
# Load environment variables from config.env only when that file is present.
if os.path.exists("config.env"):
    load_dotenv("config.env")

# Precomputed summary embeddings and the tab-separated index table that
# retrieve_contexts reads its "vectorestore_path" column from.
embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")

# Embedding model used to embed the expanded query text.
embedding_model = OpenAIEmbedding()

# LlamaIndex LLM handed to the reranker.
llm_llama = OpenAI(temperature=0.0, model="gpt-4o")

# LLM-based reranker keeping the top three nodes.
reranker = LLMRerank(top_n=3, llm=llm_llama)

# LangChain chat model used for query expansion and summarization.
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Chat prompt that asks for a comma-separated list of related terms for a query.
query_expansion_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert in HIV medicine."),
        (
            "user",
            "Given the query below, provide a concise, comma-separated list of related terms and synonyms "
            "useful for document retrieval. Return only the list, no explanations.\n\n"
            "Query: {query}",
        ),
    ]
)
def cosine_similarity_numpy(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Return the cosine similarity between ``query_vec`` and every row of ``matrix``.

    Zero-length vectors have no direction, so they are given a similarity of
    0 instead of NaN. This matters for ranking: ``argsort`` places NaN last,
    so a NaN score would otherwise be picked up by a "take the last k" slice
    as if it were the best match.

    Args:
        query_vec: 1-D vector of length ``d`` (array or plain sequence of numbers).
        matrix: 2-D array-like of shape ``(n, d)``, one candidate vector per row.

    Returns:
        1-D array of length ``n`` holding the cosine similarity of each row
        to ``query_vec``.
    """
    # Accept plain lists as well as arrays without forcing a dtype copy.
    query = np.asarray(query_vec)
    rows = np.asarray(matrix)

    query_len = np.linalg.norm(query)
    if query_len == 0:
        # A zero query is similar to nothing.
        return np.zeros(rows.shape[0])

    row_lens = np.linalg.norm(rows, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it all-zero, so its dot product is 0.
    safe_row_lens = np.where(row_lens == 0, 1.0, row_lens)

    return (rows / safe_row_lens) @ (query / query_len)
def expand_query(query, llm, prompt_template):
    """Expand ``query`` by sending it to ``llm`` through ``prompt_template``.

    Args:
        query: Text filled into the template's ``query`` slot.
        llm: Object whose ``invoke(messages)`` returns a reply with a
            ``content`` string.
        prompt_template: Object whose ``format_messages(query=...)`` builds
            the messages to send.

    Returns:
        The reply's ``content`` with leading and trailing whitespace removed.
    """
    chat_messages = prompt_template.format_messages(query=query)
    reply = llm.invoke(chat_messages)
    expanded_text = reply.content
    return expanded_text.strip()
def retrieve_contexts(expanded_query, embeddings, df, embedding_model):
    """Retrieve and rerank context passages for ``expanded_query``.

    The query is embedded and compared against ``embeddings``. The three most
    similar rows select vector-store directories from ``df``; each store is
    loaded and queried for its top three nodes, and the pooled nodes are
    reranked down to three by the module-level ``reranker``.

    Args:
        expanded_query: Query text used for embedding, per-store retrieval
            and reranking.
        embeddings: 2-D array of embeddings, one row per entry of ``df``
            (assumed to be in the same row order as ``df`` — confirm against
            how the index file is produced).
        df: DataFrame with a ``"vectorestore_path"`` column holding the
            persist directory of each vector store.
        embedding_model: Object whose ``get_text_embedding(text)`` returns
            the query embedding.

    Returns:
        List of the ``text`` of each node kept by the reranker.
    """
    query_vec = embedding_model.get_text_embedding(expanded_query)
    similarities = cosine_similarity_numpy(query_vec, embeddings)

    # argsort is ascending: take the last three and reverse for best-first.
    top_indices = similarities.argsort()[-3:][::-1]

    # argsort yields row *positions*, so select positionally; label-based
    # .loc is only equivalent when df carries a default RangeIndex.
    paths = df["vectorestore_path"].iloc[top_indices].tolist()
    print(paths)

    all_nodes = []
    for path in paths:
        ctx = StorageContext.from_defaults(persist_dir=path)
        index = load_index_from_storage(ctx)
        retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
        all_nodes.extend(retriever.retrieve(expanded_query))

    # Reuse the module-level reranker (same llm and top_n) rather than
    # constructing a new LLMRerank on every call.
    reranked = reranker.postprocess_nodes(all_nodes, QueryBundle(expanded_query))
    return [n.text for n in reranked]
def summarize(query, contexts, llm):
    """Ask ``llm`` for a bullet-point summary of ``contexts`` relevant to ``query``.

    Args:
        query: The question, inserted into the prompt.
        contexts: Iterable of source texts; each is labelled ``Source N``
            (numbered from 1) and separated by a blank line.
        llm: Object whose ``invoke(prompt)`` returns a reply with a
            ``content`` string.

    Returns:
        The reply's ``content`` with leading and trailing whitespace removed.
    """
    numbered_sources = []
    for position, source_text in enumerate(contexts, start=1):
        numbered_sources.append(f"Source {position}: {source_text}")
    sources_section = "\n\n".join(numbered_sources)

    instructions = (
        "You're a clinical assistant helping a provider answer a question using HIV/AIDS guidelines.\n\n"
        f"Question: {query}\n\n"
        "Provide a detailed summary of the most relevant points from the following source texts using bullet points.\n\n"
    )

    response = llm.invoke(instructions + sources_section)
    return response.content.strip()
# Questions pushed through the pipeline: expand -> retrieve -> summarize.
test_queries = [
    "What are important drug interactions with dolutegravir?",
    "How should PrEP be provided to adolescent girls?",
    "When is cotrimoxazole prophylaxis indicated?",
    "What are the guidelines for ART failure?",
    "How do you manage HIV in pregnancy?",
]

# One record per question, in the shape handed to Dataset.from_list below.
results = []
for q in test_queries:
    print(f"⏳ Processing: {q}")
    expanded = expand_query(q, llm, query_expansion_prompt)
    contexts = retrieve_contexts(expanded, embeddings, df, embedding_model)
    # The answer is generated from the original question, not the expansion.
    answer = summarize(q, contexts, llm)
    record = {"question": q}
    record["contexts"] = contexts
    record["answer"] = answer
    results.append(record)

# --- Ragas evaluation ---
print("✅ Running Ragas evaluation...")
ragas_data = Dataset.from_list(results)
# Only these two metrics are passed to evaluate().
eval_results = evaluate(ragas_data, metrics=[faithfulness, answer_relevancy])

# Save the per-question scores as CSV and echo them to stdout.
df_eval = eval_results.to_pandas()
df_eval.to_csv("ragas_eval_results.csv", index=False)
print("✅ Evaluation complete. Saved to ragas_eval_results.csv")
print(df_eval)