# Chatbot / evaluation.py
# (originally created by Priya-0914, commit db33d2e — header lines converted
# to comments: the raw file-viewer text was not valid Python)
import re
import time
import uuid
from typing import Dict, List

import pandas as pd
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.llms.openai import OpenAI
from tqdm import tqdm
def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Score how well *answer* is grounded in *contexts* using an LLM judge.

    Args:
        question: The user question that was answered.
        answer: The generated answer to evaluate.
        contexts: Retrieved context passages the answer should be supported by.
        llm: Any LLM object exposing a ``complete(prompt)`` method whose
            result is meaningful when passed through ``str()``.

    Returns:
        1.0 (fully supported), 0.5 (partially supported), or 0.0
        (unsupported/hallucinated, or when the judge's reply contains no
        parseable number).
    """
    context_text = "\n\n".join(contexts)
    # NOTE: arrows below were mojibake ("β†’") in the original prompt text.
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Retrieved Context:
{context_text}
Task:
Determine whether the answer is fully supported by the retrieved context.
Scoring:
- 1.0 → All claims are supported by the context
- 0.5 → Some claims supported, some not
- 0.0 → Mostly or fully unsupported / hallucinated
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)
    # Judges often wrap the score in extra text ("Score: 0.5"), which a
    # bare float() on the whole reply cannot parse and would silently
    # collapse to 0.0; extract the first numeric token instead.
    match = re.search(r"\d+(?:\.\d+)?", str(response))
    return float(match.group()) if match else 0.0
def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Score how directly *answer* addresses *question* using an LLM judge.

    Args:
        question: The user question.
        answer: The generated answer to evaluate.
        llm: Any LLM object exposing a ``complete(prompt)`` method whose
            result is meaningful when passed through ``str()``.

    Returns:
        1.0 (fully answers), 0.5 (partially answers), or 0.0 (off-topic,
        or when the judge's reply contains no parseable number).
    """
    # NOTE: arrows below were mojibake ("β†’") in the original prompt text.
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Task:
Evaluate how well the answer addresses the question.
Scoring:
- 1.0 → Fully answers the question
- 0.5 → Partially answers
- 0.0 → Does not answer / off-topic
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)
    # Judges often wrap the score in extra text ("Score: 0.5"), which a
    # bare float() on the whole reply cannot parse and would silently
    # collapse to 0.0; extract the first numeric token instead.
    match = re.search(r"\d+(?:\.\d+)?", str(response))
    return float(match.group()) if match else 0.0
def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # seconds between calls (Cohere trial: 10/min)
    reranker=None,
):
    """Evaluate RAG answers for each query, pacing calls for rate limits.

    For every query this builds a query engine over *index*, retrieves an
    answer plus its source contexts, and scores the answer with
    ``evaluate_faithfulness`` and ``evaluate_answer_relevance``.

    Args:
        queries: Questions to run through the RAG pipeline.
        index: A llama_index index providing ``as_query_engine``.
        llm: Judge LLM passed through to the scoring functions.
        top_k: Number of nodes to retrieve per query.
        per_call_delay: Seconds slept after each query to stay under the
            provider's rate limit (default 6.5s fits a 10-calls/min trial key).
        reranker: Optional node postprocessor (e.g. a Cohere rerank
            component). The original code referenced an undefined global
            ``cohere_rerank3`` here, which raised ``NameError``; pass the
            reranker explicitly instead.

    Returns:
        pandas.DataFrame with one row per query and columns
        ``query``, ``faithfulness``, ``answer_relevance``.
    """
    rows = []
    # Only attach a postprocessor when one was actually supplied.
    postprocessors = [reranker] if reranker is not None else []
    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=postprocessors,
    )
    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]
        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )
        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )
        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })
        # Sleep after each call to avoid hitting the 10/min trial limit.
        time.sleep(per_call_delay)
    df = pd.DataFrame(rows)
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df