# Source: Hugging Face Space (page status: "Sleeping") — RAG answer-evaluation utilities.
import re
import time
import uuid
from typing import Dict, List

import pandas as pd
from tqdm import tqdm

from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.llms.openai import OpenAI
def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Judge whether *answer* is grounded in *contexts* using an LLM evaluator.

    Args:
        question: The user question the answer responds to.
        answer: The generated answer to be judged.
        contexts: Retrieved context passages the answer should be supported by.
        llm: Any object exposing ``complete(prompt) -> str``-convertible result
            (e.g. a LlamaIndex LLM).

    Returns:
        1.0 (fully supported), 0.5 (partially), or 0.0 (unsupported or
        unparseable judge output).
    """
    context_text = "\n\n".join(contexts)
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Retrieved Context:
{context_text}
Task:
Determine whether the answer is fully supported by the retrieved context.
Scoring:
- 1.0 → All claims are supported by the context
- 0.5 → Some claims supported, some not
- 0.0 → Mostly or fully unsupported / hallucinated
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)
    text = str(response).strip()
    try:
        return float(text)
    except ValueError:
        # Judges often wrap the score in prose ("Score: 0.5"); salvage the
        # first numeric token instead of silently scoring 0.0.
        match = re.search(r"[-+]?\d*\.?\d+", text)
        return float(match.group()) if match else 0.0
def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Judge how well *answer* addresses *question* using an LLM evaluator.

    Args:
        question: The user question.
        answer: The generated answer to be judged.
        llm: Any object exposing ``complete(prompt) -> str``-convertible result.

    Returns:
        1.0 (fully answers), 0.5 (partially), or 0.0 (off-topic or
        unparseable judge output).
    """
    prompt = f"""
You are an evaluator.
Question:
{question}
Answer:
{answer}
Task:
Evaluate how well the answer addresses the question.
Scoring:
- 1.0 → Fully answers the question
- 0.5 → Partially answers
- 0.0 → Does not answer / off-topic
Return ONLY the score (1.0, 0.5, or 0.0).
"""
    response = llm.complete(prompt)
    text = str(response).strip()
    try:
        return float(text)
    except ValueError:
        # Same salvage as evaluate_faithfulness: extract the first numeric
        # token from a prose-wrapped verdict before defaulting to 0.0.
        match = re.search(r"[-+]?\d*\.?\d+", text)
        return float(match.group()) if match else 0.0
def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # seconds between iterations (Cohere trial: 10 calls/min)
    node_postprocessors: list | None = None,
):
    """Run each query through the index and score the answers with an LLM judge.

    Evaluates faithfulness and answer relevance per query, pacing iterations
    with ``per_call_delay`` to stay under Cohere trial-key rate limits.

    Args:
        queries: Questions to evaluate.
        index: A LlamaIndex index exposing ``as_query_engine``.
        llm: Judge LLM passed to the two evaluator functions.
        top_k: ``similarity_top_k`` for retrieval.
        per_call_delay: Sleep after every query to respect rate limits.
        node_postprocessors: Optional rerankers/postprocessors. When None,
            falls back to the module-level ``cohere_rerank3`` for backward
            compatibility. NOTE(review): that name must be defined elsewhere
            in this module or a NameError is raised here — confirm.

    Returns:
        pandas.DataFrame with columns ``query``, ``faithfulness``,
        ``answer_relevance``; also prints the numeric column means.
    """
    if node_postprocessors is None:
        node_postprocessors = [cohere_rerank3]
    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=node_postprocessors,
    )
    rows = []
    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]
        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )
        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )
        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })
        # Sleep after each query to avoid hitting the 10/min trial limit.
        time.sleep(per_call_delay)
    df = pd.DataFrame(rows)
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df