Zekun Wu
committed on
Commit
·
ef3367f
1
Parent(s):
90100ff
add
Browse files
- util/evaluator.py +85 -11
util/evaluator.py
CHANGED
|
@@ -20,7 +20,7 @@ class evaluator:
|
|
| 20 |
evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
|
| 21 |
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
|
| 22 |
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
|
| 23 |
-
and 1 indicates that the principle is fully satisfied.
|
| 24 |
|
| 25 |
Question:
|
| 26 |
{question}
|
|
@@ -50,17 +50,37 @@ class evaluator:
|
|
| 50 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 51 |
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 52 |
|
| 53 |
-
After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the
|
| 54 |
|
| 55 |
Example JSON format:
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
|
| 63 |
-
|
| 64 |
print(response)
|
| 65 |
try:
|
| 66 |
scores = json.loads(response)
|
|
@@ -139,12 +159,18 @@ class evaluator:
|
|
| 139 |
|
| 140 |
return self.validate_scores(scores)
|
| 141 |
|
|
|
|
| 142 |
def write_evaluation_commentary(scores):
|
| 143 |
evaluation_details = []
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
if score == -1:
|
| 147 |
-
evaluation_details.append(
|
|
|
|
|
|
|
| 148 |
continue
|
| 149 |
|
| 150 |
if principle == "Factually Correct":
|
|
@@ -183,8 +209,56 @@ def write_evaluation_commentary(scores):
|
|
| 183 |
else:
|
| 184 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 185 |
|
| 186 |
-
evaluation_details.append(
|
|
|
|
|
|
|
| 187 |
return evaluation_details
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
if __name__ == '__main__':
|
| 190 |
|
|
|
|
| 20 |
evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
|
| 21 |
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
|
| 22 |
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
|
| 23 |
+
and 1 indicates that the principle is fully satisfied. Additionally, provide a brief explanation for each score to justify your rating.
|
| 24 |
|
| 25 |
Question:
|
| 26 |
{question}
|
|
|
|
| 50 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 51 |
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 52 |
|
| 53 |
+
After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
|
| 54 |
|
| 55 |
Example JSON format:
|
| 56 |
+
{{
|
| 57 |
+
"Factually Correct": {{
|
| 58 |
+
"Justification": "The explanation is mostly accurate with only minor inaccuracies.",
|
| 59 |
+
"Score": 0.9
|
| 60 |
+
}},
|
| 61 |
+
"Useful": {{
|
| 62 |
+
"Justification": "The explanation is very helpful in understanding the main concept.",
|
| 63 |
+
"Score": 0.85
|
| 64 |
+
}},
|
| 65 |
+
"Context Specific": {{
|
| 66 |
+
"Justification": "The explanation is generally relevant to the specific context but lacks some detail.",
|
| 67 |
+
"Score": 0.8
|
| 68 |
+
}},
|
| 69 |
+
"User Specific": {{
|
| 70 |
+
"Justification": "The explanation is appropriate for the typical user but may be too technical for some.",
|
| 71 |
+
"Score": 0.75
|
| 72 |
+
}},
|
| 73 |
+
"Provides Pluralism": {{
|
| 74 |
+
"Justification": "The explanation considers multiple perspectives but could include more viewpoints.",
|
| 75 |
+
"Score": 0.7
|
| 76 |
+
}}
|
| 77 |
+
}}
|
| 78 |
+
|
| 79 |
+
Answer:
|
| 80 |
+
"""
|
| 81 |
|
| 82 |
response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
|
| 83 |
+
|
| 84 |
print(response)
|
| 85 |
try:
|
| 86 |
scores = json.loads(response)
|
|
|
|
| 159 |
|
| 160 |
return self.validate_scores(scores)
|
| 161 |
|
| 162 |
+
|
| 163 |
def write_evaluation_commentary(scores):
|
| 164 |
evaluation_details = []
|
| 165 |
+
|
| 166 |
+
for principle, details in scores.items():
|
| 167 |
+
score = details.get('Score', -1)
|
| 168 |
+
justification = details.get('Justification', '')
|
| 169 |
|
| 170 |
if score == -1:
|
| 171 |
+
evaluation_details.append(
|
| 172 |
+
{'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
|
| 173 |
+
'Justification': justification})
|
| 174 |
continue
|
| 175 |
|
| 176 |
if principle == "Factually Correct":
|
|
|
|
| 209 |
else:
|
| 210 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 211 |
|
| 212 |
+
evaluation_details.append(
|
| 213 |
+
{'Principle': principle, 'Score': score, 'Commentary': comment, 'Justification': justification})
|
| 214 |
+
|
| 215 |
return evaluation_details
|
| 216 |
+
# def write_evaluation_commentary(scores):
|
| 217 |
+
# evaluation_details = []
|
| 218 |
+
# for principle, score in scores.items():
|
| 219 |
+
#
|
| 220 |
+
# if score == -1:
|
| 221 |
+
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
|
| 222 |
+
# continue
|
| 223 |
+
#
|
| 224 |
+
# if principle == "Factually Correct":
|
| 225 |
+
# if score >= 0.8:
|
| 226 |
+
# comment = "Excellent accuracy! The information is precise and directly relevant to the question."
|
| 227 |
+
# elif score >= 0.5:
|
| 228 |
+
# comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
|
| 229 |
+
# else:
|
| 230 |
+
# comment = "The explanation contains significant inaccuracies or irrelevant information."
|
| 231 |
+
# elif principle == "Useful":
|
| 232 |
+
# if score >= 0.8:
|
| 233 |
+
# comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
|
| 234 |
+
# elif score >= 0.5:
|
| 235 |
+
# comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
|
| 236 |
+
# else:
|
| 237 |
+
# comment = "The explanation does little to help understand or apply the information provided."
|
| 238 |
+
# elif principle == "Context Specific":
|
| 239 |
+
# if score >= 0.8:
|
| 240 |
+
# comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
|
| 241 |
+
# elif score >= 0.5:
|
| 242 |
+
# comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
|
| 243 |
+
# else:
|
| 244 |
+
# comment = "Fails to address the context of the question, lacking relevance or specificity."
|
| 245 |
+
# elif principle == "User Specific":
|
| 246 |
+
# if score >= 0.8:
|
| 247 |
+
# comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
|
| 248 |
+
# elif score >= 0.5:
|
| 249 |
+
# comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
|
| 250 |
+
# else:
|
| 251 |
+
# comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
|
| 252 |
+
# elif principle == "Provides Pluralism":
|
| 253 |
+
# if score >= 0.8:
|
| 254 |
+
# comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
|
| 255 |
+
# elif score >= 0.5:
|
| 256 |
+
# comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
|
| 257 |
+
# else:
|
| 258 |
+
# comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 259 |
+
#
|
| 260 |
+
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
|
| 261 |
+
# return evaluation_details
|
| 262 |
|
| 263 |
if __name__ == '__main__':
|
| 264 |
|