# Evaluation script: custom RAG pipeline over HIV/AIDS guideline summaries,
# instrumented with TruLens feedback functions.
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import os
from llama_index.core import StorageContext, load_index_from_storage, QueryBundle
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import LLMRerank
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from trulens_eval import Tru
from trulens.core import Feedback
from trulens.providers.openai import OpenAI as OpenAIFeedbackProvider
from trulens_eval.tru_app import TruLlama
# Load environment
# Pull API keys (e.g. OPENAI_API_KEY) from config.env when present; otherwise
# rely on variables already exported in the shell environment.
if os.path.exists("config.env"):
    load_dotenv("config.env")
# Load vectorstore metadata
# embeddings: precomputed per-document summary embeddings, one row per summary.
# df: index table aligning each embedding row with its persisted vectorstore
# (expected to contain a "vectorestore_path" column — spelling must match the
# TSV header exactly; it is read in retrieve_contexts).
embeddings = np.load("data/processed/lp/summary_embeddings/embeddings.npy")
df = pd.read_csv("data/processed/lp/summary_embeddings/index.tsv", sep="\t")
# LLMs and components
embedding_model = OpenAIEmbedding()  # embeds expanded queries for coarse retrieval
llm_llama = OpenAI(model="gpt-4o", temperature=0.0)  # LlamaIndex LLM used by the reranker
reranker = LLMRerank(llm=llm_llama, top_n=3)  # keeps the 3 most relevant nodes
# langchain summarize LLM (also used for query expansion)
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
# Fix: the original defined three Feedback objects here with Groundedness(),
# Relevance() and AnswerRelevance(), none of which are imported anywhere in
# this file — evaluating those lines raised NameError at import time. They
# duplicated the provider-based feedbacks defined later (f_grounded,
# f_context_rel, f_answer_rel), so they are removed.
# Prompt for query expansion
# Two-message chat prompt: the system message sets the persona, the user
# message asks for a comma-separated list of retrieval synonyms; {query} is
# filled in by expand_query via format_messages.
query_expansion_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert in HIV medicine."),
    ("user", (
        "Given the query below, provide a concise, comma-separated list of related terms and synonyms "
        "useful for document retrieval. Return only the list, no explanations.\n\n"
        "Query: {query}"
    ))
])
# ---------- Functions ----------
def cosine_similarity_numpy(query_vec: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Return the cosine similarity between a query vector and each matrix row.

    Accepts any array-like input: the OpenAI embedding client actually returns
    a plain Python list (see retrieve_contexts), so both arguments are
    converted with ``np.asarray`` before use.

    Args:
        query_vec: 1-D array-like of shape (d,). Must have nonzero norm.
        matrix: 2-D array-like of shape (n, d), one embedding per row,
            with nonzero row norms.

    Returns:
        1-D ndarray of shape (n,) with each row's cosine similarity to
        ``query_vec``.
    """
    query_vec = np.asarray(query_vec, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    query_norm = query_vec / np.linalg.norm(query_vec)
    matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix_norm @ query_norm
def expand_query(query, llm, prompt_template):
    """Ask the LLM for retrieval synonyms of ``query``.

    Formats ``prompt_template`` with the query, invokes the LLM on the
    resulting messages, and returns the reply text stripped of surrounding
    whitespace (expected to be a comma-separated term list, per the prompt).
    """
    formatted_messages = prompt_template.format_messages(query=query)
    response = llm.invoke(formatted_messages)
    return response.content.strip()
def retrieve_contexts(expanded_query, embeddings, df, embedding_model):
    """Two-stage retrieval: coarse vectorstore selection, then fine retrieval.

    1. Embed the expanded query and rank all document summaries by cosine
       similarity; keep the 3 vectorstores belonging to the top summaries.
    2. Retrieve the top-3 nodes from each selected vectorstore, pool them,
       and rerank the pool with the module-level LLM reranker.

    Args:
        expanded_query: query string (typically the output of expand_query).
        embeddings: (n, d) array of summary embeddings; rows align with df.
        df: metadata frame; must contain a "vectorestore_path" column
            (spelling as written — presumably matching the source TSV header;
            verify against the index file) pointing at persisted indices.
        embedding_model: object exposing get_text_embedding(str) -> vector.

    Returns:
        List of reranked node text strings (at most reranker.top_n = 3).
    """
    query_vec = embedding_model.get_text_embedding(expanded_query)
    sims = cosine_similarity_numpy(query_vec, embeddings)
    # argsort is ascending, so the last 3 are most similar; reverse for descending order
    top_indices = sims.argsort()[-3:][::-1]
    paths = df.loc[top_indices, "vectorestore_path"].tolist()
    all_nodes = []
    for path in paths:
        # Load the persisted LlamaIndex vectorstore for this document
        ctx = StorageContext.from_defaults(persist_dir=path)
        index = load_index_from_storage(ctx)
        retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
        all_nodes.extend(retriever.retrieve(expanded_query))
    # NOTE(review): `reranker` is the module-level LLMRerank instance, not a
    # parameter — this function cannot be reused independently as written.
    reranked = reranker.postprocess_nodes(all_nodes, QueryBundle(expanded_query))
    return [n.text for n in reranked]
def summarize(query, contexts, llm):
    """Summarize ``contexts`` into a bulleted answer to ``query`` via the LLM.

    Builds a single prompt containing the question and the numbered source
    texts, invokes the LLM, and returns the stripped reply text.
    """
    numbered_sources = "\n\n".join(
        f"Source {idx}: {text}" for idx, text in enumerate(contexts, start=1)
    )
    prompt = (
        "You're a clinical assistant helping a provider answer a question using HIV/AIDS guidelines.\n\n"
        f"Question: {query}\n\n"
        "Provide a detailed summary of the most relevant points to the user question from the following source texts. Use bullet points.\n\n"
        + numbered_sources
    )
    response = llm.invoke(prompt)
    return response.content.strip()
# ---------- RAG Pipeline ----------
def custom_rag_app(query):
    """Run the full pipeline for one question: expand -> retrieve -> summarize.

    Uses the module-level LLMs, prompt, and vectorstore metadata. Returns a
    dict with the original question, the expanded query, the retrieved
    context texts, and the final answer.
    """
    expanded_query = expand_query(query, llm, query_expansion_prompt)
    context_texts = retrieve_contexts(expanded_query, embeddings, df, embedding_model)
    final_answer = summarize(query, context_texts, llm)
    return {
        "question": query,
        "expanded_query": expanded_query,
        "contexts": context_texts,
        "answer": final_answer,
    }
# ---------- Feedbacks ----------
# OpenAI-backed feedback provider from the trulens core API.
provider = OpenAIFeedbackProvider()
# NOTE(review): this file mixes trulens_eval (legacy) and trulens (new)
# imports — confirm the .on_input()/.on_context()/.on_output() selectors and
# provider method names exist in the installed versions.
f_grounded = Feedback(provider.groundedness).on_input().on_context().with_name("faithfulness")
f_context_rel = Feedback(provider.context_relevance).on_input().on_context().with_name("context_relevance")
f_answer_rel = Feedback(provider.relevance).on_input().on_output().with_name("answer_relevance")
# ---------- TruLens App ----------
# NOTE(review): TruLlama is documented as a wrapper for LlamaIndex query
# engines, but here it wraps a plain Python function — verify this is
# supported (TruCustomApp is the usual wrapper for custom pipelines).
tru_llama = TruLlama(
    app=custom_rag_app,
    feedbacks=[f_grounded, f_context_rel, f_answer_rel],
    app_id="evaluate-trulens-llama-v2"
)
# Workspace/session handle (backs the TruLens database and dashboard).
tru = Tru()
# ---------- Run Evaluation ----------
# Five representative clinical questions used as the evaluation set.
test_queries = [
    "What are important drug interactions with dolutegravir?",
    "How should PrEP be provided to adolescent girls?",
    "When is cotrimoxazole prophylaxis indicated?",
    "What are the guidelines for ART failure?",
    "How do you manage HIV in pregnancy?"
]
records = []
for q in test_queries:
    # NOTE(review): run_with_record and the dict-style record/feedback access
    # below do not match the documented TruLlama recording API
    # (`with tru_llama as recording:`); confirm against the installed
    # trulens version before relying on these keys.
    record = tru_llama.run_with_record(question=q)
    fb = record["feedback"]
    records.append({
        "question": q,
        "answer": record["output"],
        "contexts": record["context"],
        # .get() tolerates feedback entries that carry no score/justification
        "faithfulness_score": fb["faithfulness"].get("score"),
        "context_relevance_score": fb["context_relevance"].get("score"),
        "answer_relevance_score": fb["answer_relevance"].get("score"),
        "faithfulness_justification": fb["faithfulness"].get("justification", "")
    })
# Fix: the original rebound the module-level `df` (the vectorstore metadata
# frame that retrieve_contexts depends on) to the results table; use a
# distinct name so the metadata is not clobbered.
results_df = pd.DataFrame(records)
results_df.to_csv("trulens_llama_eval_results.csv", index=False)
print("✅ Evaluation complete. Saved to trulens_llama_eval_results.csv")
print(results_df)