Priya-0914 committed
Commit db33d2e · verified · 1 Parent(s): b6033dc

Create evaluation.py

Files changed (1): evaluation.py +126 -0
evaluation.py ADDED
from tqdm import tqdm
import time

import pandas as pd


def evaluate_faithfulness(
    question: str,
    answer: str,
    contexts: list[str],
    llm,
) -> float:
    """Score whether the claims in the answer are supported by the retrieved context."""
    context_text = "\n\n".join(contexts)

    prompt = f"""
You are an evaluator.

Question:
{question}

Answer:
{answer}

Retrieved Context:
{context_text}

Task:
Determine whether the answer is fully supported by the retrieved context.

Scoring:
- 1.0 → All claims are supported by the context
- 0.5 → Some claims supported, some not
- 0.0 → Mostly or fully unsupported / hallucinated

Return ONLY the score (1.0, 0.5, or 0.0).
"""

    response = llm.complete(prompt)
    try:
        return float(str(response).strip())
    except ValueError:
        # The judge returned something other than a bare score; treat as unsupported.
        return 0.0


def evaluate_answer_relevance(
    question: str,
    answer: str,
    llm,
) -> float:
    """Score how directly the answer addresses the question."""
    prompt = f"""
You are an evaluator.

Question:
{question}

Answer:
{answer}

Task:
Evaluate how well the answer addresses the question.

Scoring:
- 1.0 → Fully answers the question
- 0.5 → Partially answers
- 0.0 → Does not answer / off-topic

Return ONLY the score (1.0, 0.5, or 0.0).
"""

    response = llm.complete(prompt)
    try:
        return float(str(response).strip())
    except ValueError:
        # The judge returned something other than a bare score; treat as off-topic.
        return 0.0


def evaluate_rag_answers_safe(
    queries: list[str],
    index,
    llm,
    top_k: int = 10,
    per_call_delay: float = 6.5,  # seconds to sleep between queries; keeps a Cohere trial key under 10 calls/min
    node_postprocessors=None,  # optional, e.g. a CohereRerank instance wrapped in a list
):
    """
    Evaluate RAG answers safely with respect to Cohere trial key limits.
    """
    rows = []
    query_engine = index.as_query_engine(
        similarity_top_k=top_k,
        node_postprocessors=node_postprocessors or [],
    )

    for query in tqdm(queries, desc="Evaluating queries"):
        response = query_engine.query(query)
        answer = response.response
        contexts = [n.node.get_content() for n in response.source_nodes]

        faithfulness = evaluate_faithfulness(
            question=query,
            answer=answer,
            contexts=contexts,
            llm=llm,
        )

        relevance = evaluate_answer_relevance(
            question=query,
            answer=answer,
            llm=llm,
        )

        rows.append({
            "query": query,
            "faithfulness": faithfulness,
            "answer_relevance": relevance,
        })

        # Sleep after each query to stay under the 10/min trial limit
        time.sleep(per_call_delay)

    df = pd.DataFrame(rows)
    print("Average Scores:")
    print(df.mean(numeric_only=True))
    return df
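
For reference, a minimal driver sketch showing how this module might be invoked end to end. It assumes llama-index with the OpenAI integration is installed and OPENAI_API_KEY is set; the data directory, model name, and example queries are illustrative and not part of this commit.

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI

from evaluation import evaluate_rag_answers_safe

# Build a simple in-memory index over local documents (hypothetical "data/" folder).
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Judge LLM used for both faithfulness and relevance scoring.
llm = OpenAI(model="gpt-4o-mini")

# Illustrative queries; replace with your own evaluation set.
queries = [
    "What does the document say about the refund policy?",
    "Which limitations are mentioned in the conclusion?",
]

df = evaluate_rag_answers_safe(queries, index=index, llm=llm, top_k=10)
df.to_csv("rag_eval_results.csv", index=False)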