import time

import pandas as pd
from llama_index.llms.openai import OpenAI
from tqdm import tqdm
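
# NOTE: `cohere_rerank3` is used below but was not defined in this file.
# A minimal sketch of one plausible definition, assuming the
# llama-index-postprocessor-cohere-rerank package is installed and
# COHERE_API_KEY is set in the environment; the model name and top_n
# here are illustrative assumptions, not the original configuration.
from llama_index.postprocessor.cohere_rerank import CohereRerank

cohere_rerank3 = CohereRerank(
    model="rerank-english-v3.0",  # assumed Cohere Rerank 3 model
    top_n=5,                      # assumed: keep the 5 best-ranked nodes
)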


def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Score how fully the answer is supported by the retrieved contexts.

    Returns 1.0 (fully supported), 0.5 (partially supported), or
    0.0 (mostly or fully unsupported).
    """
    context_text = "\n\n".join(contexts)

    prompt = f"""
You are an evaluator.

Question:
{question}

Answer:
{answer}

Retrieved Context:
{context_text}

Task:
Determine whether the answer is fully supported by the retrieved context.

Scoring:
- 1.0 → All claims are supported by the context
- 0.5 → Some claims supported, some not
- 0.0 → Mostly or fully unsupported / hallucinated

Return ONLY the score (1.0, 0.5, or 0.0).
"""

    response = llm.complete(prompt)
    try:
        return float(str(response).strip())
    except ValueError:
        # Conservative fallback: anything other than a bare numeric score
        # is treated as unsupported.
        return 0.0

def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Score how directly the answer addresses the question.

    Returns 1.0 (fully answers), 0.5 (partially answers), or
    0.0 (off-topic).
    """
    prompt = f"""
You are an evaluator.

Question:
{question}

Answer:
{answer}

Task:
Evaluate how well the answer addresses the question.

Scoring:
- 1.0 → Fully answers the question
- 0.5 → Partially answers
- 0.0 → Does not answer / off-topic

Return ONLY the score (1.0, 0.5, or 0.0).
"""

    response = llm.complete(prompt)
    try:
        return float(str(response).strip())
    except ValueError:
        # Conservative fallback: anything other than a bare numeric score
        # is treated as off-topic.
        return 0.0

def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # seconds between queries; see docstring
):
    """
    Evaluate RAG answers while staying under Cohere trial-key rate limits.

    Each query triggers one Cohere rerank call, so spacing queries at
    least 6 seconds apart stays under the 10-calls/minute trial limit;
    6.5 seconds leaves a small safety margin. Returns a DataFrame with
    per-query faithfulness and answer-relevance scores.
    """
    rows = []
    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=[cohere_rerank3],  # optional reranking step
    )

    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]

        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )

        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )

        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })

        # Throttle: one Cohere rerank call happens per query, so sleeping
        # here keeps the trial key under its 10-calls/minute limit.
        time.sleep(per_call_delay)

    df = pd.DataFrame(rows)
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df
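

# A minimal usage sketch, not part of the original file. It assumes
# OPENAI_API_KEY and COHERE_API_KEY are set in the environment; the toy
# documents and query below are illustrative placeholders only.
if __name__ == "__main__":
    from llama_index.core import Document, VectorStoreIndex

    docs = [
        Document(text="The pipeline embeds chunks with OpenAI embeddings."),
        Document(text="Documents are split into 512-token chunks before indexing."),
    ]
    index = VectorStoreIndex.from_documents(docs)

    eval_llm = OpenAI(model="gpt-4o-mini", temperature=0.0)
    scores_df = evaluate_rag_answers_safe(
        queries=["How are documents chunked before indexing?"],
        index=index,
        llm=eval_llm,
        per_call_delay=6.5,
    )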