"""LLM-as-judge evaluation utilities for a RAG pipeline.

Scores generated answers on two axes — faithfulness to the retrieved
context and relevance to the question — and aggregates the results into
a pandas DataFrame, pacing calls to respect trial-key rate limits.
"""

import re
import time
import uuid
from typing import Dict, List

import pandas as pd
from tqdm import tqdm

from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.llms.openai import OpenAI


def _parse_score(raw) -> float:
    """Extract a numeric score from an LLM judge response.

    The judge is asked to return ONLY ``1.0``, ``0.5``, or ``0.0``, but
    chat models frequently wrap the number in prose ("Score: 1.0").
    First try a direct float conversion; failing that, take the first
    numeric token in the text. Returns 0.0 when nothing parseable is
    found (matches the original fallback behavior).
    """
    text = str(raw).strip()
    try:
        return float(text)
    except ValueError:
        match = re.search(r"[-+]?\d*\.?\d+", text)
        if match is not None:
            try:
                return float(match.group())
            except ValueError:
                pass
        return 0.0


def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Score how well *answer* is grounded in *contexts*.

    Args:
        question: The user query that produced the answer.
        answer: The generated answer under evaluation.
        contexts: Retrieved context passages the answer should rely on.
        llm: Any object exposing ``complete(prompt) -> response``.

    Returns:
        1.0 (fully supported), 0.5 (partially), or 0.0 (unsupported or
        unparseable judge output).
    """
    context_text = "\n\n".join(contexts)
    prompt = f""" You are an evaluator. Question: {question} Answer: {answer} Retrieved Context: {context_text} Task: Determine whether the answer is fully supported by the retrieved context. Scoring: - 1.0 → All claims are supported by the context - 0.5 → Some claims supported, some not - 0.0 → Mostly or fully unsupported / hallucinated Return ONLY the score (1.0, 0.5, or 0.0). """
    response = llm.complete(prompt)
    return _parse_score(response)


def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Score how directly *answer* addresses *question*.

    Args:
        question: The user query.
        answer: The generated answer under evaluation.
        llm: Any object exposing ``complete(prompt) -> response``.

    Returns:
        1.0 (fully answers), 0.5 (partially), or 0.0 (off-topic or
        unparseable judge output).
    """
    prompt = f""" You are an evaluator. Question: {question} Answer: {answer} Task: Evaluate how well the answer addresses the question. Scoring: - 1.0 → Fully answers the question - 0.5 → Partially answers - 0.0 → Does not answer / off-topic Return ONLY the score (1.0, 0.5, or 0.0). """
    response = llm.complete(prompt)
    return _parse_score(response)


def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # 6.5 seconds between Cohere API calls
    reranker=None,
):
    """Evaluate RAG answers safely with respect to Cohere trial key limits.

    BUG FIX: the original referenced an undefined global ``cohere_rerank3``
    as a node postprocessor, which raised ``NameError`` on every call. The
    reranker is now an optional parameter; when ``None`` (the default) no
    postprocessor is attached.

    Args:
        queries: Questions to run through the query engine.
        index: A llama_index index exposing ``as_query_engine``.
        llm: Judge LLM exposing ``complete(prompt)``.
        top_k: Number of nodes to retrieve per query.
        per_call_delay: Sleep (seconds) after each query to stay under
            the ~10 requests/minute trial limit.
        reranker: Optional node postprocessor (e.g. a Cohere reranker).

    Returns:
        pandas.DataFrame with one row per query: ``query``,
        ``faithfulness``, ``answer_relevance``. Also prints the mean of
        the numeric columns.
    """
    # Only pass a postprocessor list when a reranker was supplied.
    postprocessors = [reranker] if reranker is not None else []
    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=postprocessors,
    )

    rows = []
    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]

        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )
        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )

        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })

        # Sleep after each call to avoid hitting the 10/min trial limit
        time.sleep(per_call_delay)

    df = pd.DataFrame(rows)
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df