Spaces:
Sleeping
Sleeping
File size: 2,759 Bytes
db33d2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import re
import time
import uuid
from typing import Dict, List

import pandas as pd
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.llms.openai import OpenAI
from tqdm import tqdm
def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Ask ``llm`` to grade whether ``answer`` is supported by ``contexts``.

    Args:
        question: The query that produced the answer.
        answer: The generated answer to grade.
        contexts: Retrieved passages; they are joined with blank lines and
            embedded in the grading prompt.
        llm: Any object exposing ``complete(prompt)`` whose return value
            stringifies to the model's reply.

    Returns:
        The first number found in the model's reply, clamped to
        ``[0.0, 1.0]``. Returns ``0.0`` when the reply contains no number.
    """
    context_text = "\n\n".join(contexts)
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Retrieved Context:
{context_text}
Task:
Determine whether the answer is fully supported by the retrieved context.
Scoring:
- 1.0 → All claims are supported by the context
- 0.5 → Some claims supported, some not
- 0.0 → Mostly or fully unsupported / hallucinated
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)

    # Models frequently decorate the score ("Score: 0.5", "1.0."), so pull
    # out the first numeric token instead of requiring a bare float. This
    # also rejects "nan"/"inf", which float() would otherwise accept.
    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", str(response))
    if match is None:
        return 0.0

    # Clamp so an off-scale reply cannot skew averaged scores.
    return min(1.0, max(0.0, float(match.group())))
def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Ask ``llm`` to grade how well ``answer`` addresses ``question``.

    Args:
        question: The query being answered.
        answer: The generated answer to grade.
        llm: Any object exposing ``complete(prompt)`` whose return value
            stringifies to the model's reply.

    Returns:
        The first number found in the model's reply, clamped to
        ``[0.0, 1.0]``. Returns ``0.0`` when the reply contains no number.
    """
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Task:
Evaluate how well the answer addresses the question.
Scoring:
- 1.0 → Fully answers the question
- 0.5 → Partially answers
- 0.0 → Does not answer / off-topic
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)

    # Models frequently decorate the score ("Score: 0.5", "1.0."), so pull
    # out the first numeric token instead of requiring a bare float. This
    # also rejects "nan"/"inf", which float() would otherwise accept.
    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", str(response))
    if match is None:
        return 0.0

    # Clamp so an off-scale reply cannot skew averaged scores.
    return min(1.0, max(0.0, float(match.group())))
def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # 6.5 seconds between Cohere API calls
    node_postprocessors=None,
):
    """
    Evaluate RAG answers safely with respect to Cohere trial key limits.

    Each query is run through a query engine built from ``index``; the
    answer is then scored with ``evaluate_faithfulness`` and
    ``evaluate_answer_relevance`` using ``llm``.

    Args:
        queries: Questions to run through the query engine.
        index: Object exposing ``as_query_engine(...)``.
        llm: Judge model forwarded to the two scoring functions.
        top_k: Passed to the query engine as ``similarity_top_k``.
        per_call_delay: Seconds to sleep after each query. Values <= 0
            disable the pause.
        node_postprocessors: Postprocessors passed to the query engine.
            When ``None``, a module-level ``cohere_rerank3`` is used if one
            is defined; otherwise no postprocessor is applied.

    Returns:
        A DataFrame with one row per query and the columns ``query``,
        ``faithfulness`` and ``answer_relevance``. The column means are
        also printed.
    """
    if node_postprocessors is None:
        # The reranker is expected to be a module-level object created
        # elsewhere; look it up lazily so a missing definition degrades to
        # "no reranking" instead of raising NameError.
        reranker = globals().get("cohere_rerank3")
        node_postprocessors = [] if reranker is None else [reranker]

    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=node_postprocessors,
    )

    rows = []
    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]

        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )
        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )
        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })

        # Sleep after each call to avoid hitting the 10/min trial limit.
        # time.sleep() raises ValueError on negative input, so guard it.
        if per_call_delay > 0:
            time.sleep(per_call_delay)

    # Explicit columns keep the frame's schema stable when `queries` is empty.
    df = pd.DataFrame(
        rows, columns=["query", "faithfulness", "answer_relevance"]
    )
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df
|