playing_with_the_source_code
#2
by
XinGuan2000
- opened
- pages/1_Single_Evaluation.py +1 -1
- pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py} +0 -0
- pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py} +0 -0
- pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py} +0 -0
- pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py} +1 -1
- util/evaluator.py +60 -171
pages/1_Single_Evaluation.py
CHANGED
|
@@ -57,7 +57,7 @@ if not st.session_state.get('password_correct', False):
|
|
| 57 |
check_password()
|
| 58 |
else:
|
| 59 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 60 |
-
model_name = st.selectbox('Select a model:', ['gpt35-1106'])
|
| 61 |
|
| 62 |
# User choice between predefined examples or their own input
|
| 63 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
|
|
|
| 57 |
check_password()
|
| 58 |
else:
|
| 59 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 60 |
+
model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
|
| 61 |
|
| 62 |
# User choice between predefined examples or their own input
|
| 63 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py}
RENAMED
|
File without changes
|
pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py}
RENAMED
|
File without changes
|
pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py}
RENAMED
|
File without changes
|
pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py}
RENAMED
|
@@ -66,7 +66,7 @@ if not st.session_state.get('password_correct', False):
|
|
| 66 |
check_password()
|
| 67 |
else:
|
| 68 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 69 |
-
model_name = st.selectbox('Select a model:', ['gpt35-1106'])
|
| 70 |
|
| 71 |
# User choice between predefined examples or their own input
|
| 72 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
|
|
|
| 66 |
check_password()
|
| 67 |
else:
|
| 68 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 69 |
+
model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
|
| 70 |
|
| 71 |
# User choice between predefined examples or their own input
|
| 72 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
util/evaluator.py
CHANGED
|
@@ -9,34 +9,20 @@ class evaluator:
|
|
| 9 |
|
| 10 |
def validate_scores(self, scores):
|
| 11 |
required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
|
| 12 |
-
|
| 13 |
for key in required_keys:
|
| 14 |
-
if key not in scores:
|
| 15 |
-
return {
|
| 16 |
-
|
| 17 |
-
score_data = scores[key]
|
| 18 |
-
|
| 19 |
-
if not isinstance(score_data, dict):
|
| 20 |
-
return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}
|
| 21 |
-
|
| 22 |
-
if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
|
| 23 |
-
0 <= score_data["Score"] <= 10):
|
| 24 |
-
return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}
|
| 25 |
-
|
| 26 |
-
if "Justification" not in score_data or not isinstance(score_data["Justification"], str) or not score_data[
|
| 27 |
-
"Justification"].strip():
|
| 28 |
-
return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}
|
| 29 |
|
| 30 |
return scores
|
| 31 |
|
| 32 |
def evaluate_single(self, question,explanation):
|
| 33 |
|
| 34 |
-
evaluation_prompt = f"""You are provided with a user's
|
| 35 |
-
an
|
| 36 |
-
should be scored on a scale from 0 to
|
| 37 |
-
and
|
| 38 |
|
| 39 |
-
|
| 40 |
{question}
|
| 41 |
|
| 42 |
Provided Explanation:
|
|
@@ -46,55 +32,35 @@ class evaluator:
|
|
| 46 |
|
| 47 |
Factually Correct:
|
| 48 |
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 49 |
-
Score: (0-
|
| 50 |
|
| 51 |
Useful:
|
| 52 |
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 53 |
-
Score: (0-
|
| 54 |
|
| 55 |
Context Specific:
|
| 56 |
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 57 |
-
Score: (0-
|
| 58 |
|
| 59 |
User Specific:
|
| 60 |
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 61 |
-
Score: (0-
|
| 62 |
|
| 63 |
Provides Pluralism:
|
| 64 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 65 |
-
Score: (0-
|
| 66 |
|
| 67 |
-
After evaluating the provided question and explanation based on the five principles, please format your scores
|
| 68 |
|
| 69 |
Example JSON format:
|
| 70 |
-
|
| 71 |
-
"Factually Correct":
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
"Useful": {{
|
| 76 |
-
"Justification": "xxx",
|
| 77 |
-
"Score": 8.5
|
| 78 |
-
}},
|
| 79 |
-
"Context Specific": {{
|
| 80 |
-
"Justification": "xxx",
|
| 81 |
-
"Score": 8
|
| 82 |
-
}},
|
| 83 |
-
"User Specific": {{
|
| 84 |
-
"Justification": "xxx",
|
| 85 |
-
"Score": 7.5
|
| 86 |
-
}},
|
| 87 |
-
"Provides Pluralism": {{
|
| 88 |
-
"Justification": "xxx",
|
| 89 |
-
"Score": 7
|
| 90 |
-
}}
|
| 91 |
-
}}
|
| 92 |
-
|
| 93 |
-
Answer:
|
| 94 |
-
"""
|
| 95 |
-
|
| 96 |
-
response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()
|
| 97 |
|
|
|
|
|
|
|
| 98 |
print(response)
|
| 99 |
try:
|
| 100 |
scores = json.loads(response)
|
|
@@ -119,70 +85,48 @@ class evaluator:
|
|
| 119 |
def evaluate_conversation(self, conversation, context):
|
| 120 |
formatted_conversation = self.format_conversation(conversation)
|
| 121 |
evaluation_prompt = f"""
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
"Factually Correct": {{
|
| 159 |
-
"Justification": "xxx",
|
| 160 |
-
"Score": 9
|
| 161 |
-
}},
|
| 162 |
-
"Useful": {{
|
| 163 |
-
"Justification": "xxx",
|
| 164 |
-
"Score": 8.5
|
| 165 |
-
}},
|
| 166 |
-
"Context Specific": {{
|
| 167 |
-
"Justification": "xxx",
|
| 168 |
-
"Score": 8
|
| 169 |
-
}},
|
| 170 |
-
"User Specific": {{
|
| 171 |
-
"Justification": "xxx",
|
| 172 |
-
"Score": 7.5
|
| 173 |
-
}},
|
| 174 |
-
"Provides Pluralism": {{
|
| 175 |
-
"Justification": "xxx",
|
| 176 |
-
"Score": 7
|
| 177 |
-
}}
|
| 178 |
-
}}
|
| 179 |
-
|
| 180 |
Answer:
|
| 181 |
"""
|
| 182 |
|
| 183 |
print(evaluation_prompt)
|
| 184 |
|
| 185 |
-
response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=
|
| 186 |
try:
|
| 187 |
scores = json.loads(response)
|
| 188 |
except json.JSONDecodeError:
|
|
@@ -195,19 +139,12 @@ class evaluator:
|
|
| 195 |
|
| 196 |
return self.validate_scores(scores)
|
| 197 |
|
| 198 |
-
|
| 199 |
def write_evaluation_commentary(scores):
|
| 200 |
evaluation_details = []
|
| 201 |
-
|
| 202 |
-
for principle, details in scores.items():
|
| 203 |
-
print(details)
|
| 204 |
-
score = details.get('Score', -1)
|
| 205 |
-
justification = details.get('Justification', '')
|
| 206 |
|
| 207 |
if score == -1:
|
| 208 |
-
evaluation_details.append(
|
| 209 |
-
{'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
|
| 210 |
-
'Justification': justification})
|
| 211 |
continue
|
| 212 |
|
| 213 |
if principle == "Factually Correct":
|
|
@@ -246,56 +183,8 @@ def write_evaluation_commentary(scores):
|
|
| 246 |
else:
|
| 247 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 248 |
|
| 249 |
-
evaluation_details.append(
|
| 250 |
-
{'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})
|
| 251 |
-
|
| 252 |
return evaluation_details
|
| 253 |
-
# def write_evaluation_commentary(scores):
|
| 254 |
-
# evaluation_details = []
|
| 255 |
-
# for principle, score in scores.items():
|
| 256 |
-
#
|
| 257 |
-
# if score == -1:
|
| 258 |
-
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
|
| 259 |
-
# continue
|
| 260 |
-
#
|
| 261 |
-
# if principle == "Factually Correct":
|
| 262 |
-
# if score >= 0.8:
|
| 263 |
-
# comment = "Excellent accuracy! The information is precise and directly relevant to the question."
|
| 264 |
-
# elif score >= 0.5:
|
| 265 |
-
# comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
|
| 266 |
-
# else:
|
| 267 |
-
# comment = "The explanation contains significant inaccuracies or irrelevant information."
|
| 268 |
-
# elif principle == "Useful":
|
| 269 |
-
# if score >= 0.8:
|
| 270 |
-
# comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
|
| 271 |
-
# elif score >= 0.5:
|
| 272 |
-
# comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
|
| 273 |
-
# else:
|
| 274 |
-
# comment = "The explanation does little to help understand or apply the information provided."
|
| 275 |
-
# elif principle == "Context Specific":
|
| 276 |
-
# if score >= 0.8:
|
| 277 |
-
# comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
|
| 278 |
-
# elif score >= 0.5:
|
| 279 |
-
# comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
|
| 280 |
-
# else:
|
| 281 |
-
# comment = "Fails to address the context of the question, lacking relevance or specificity."
|
| 282 |
-
# elif principle == "User Specific":
|
| 283 |
-
# if score >= 0.8:
|
| 284 |
-
# comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
|
| 285 |
-
# elif score >= 0.5:
|
| 286 |
-
# comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
|
| 287 |
-
# else:
|
| 288 |
-
# comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
|
| 289 |
-
# elif principle == "Provides Pluralism":
|
| 290 |
-
# if score >= 0.8:
|
| 291 |
-
# comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
|
| 292 |
-
# elif score >= 0.5:
|
| 293 |
-
# comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
|
| 294 |
-
# else:
|
| 295 |
-
# comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 296 |
-
#
|
| 297 |
-
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
|
| 298 |
-
# return evaluation_details
|
| 299 |
|
| 300 |
if __name__ == '__main__':
|
| 301 |
|
|
|
|
| 9 |
|
| 10 |
def validate_scores(self, scores):
|
| 11 |
required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
|
|
|
|
| 12 |
for key in required_keys:
|
| 13 |
+
if key not in scores or not isinstance(scores[key], (int, float)) or not (-1 <= scores[key] <= 1):
|
| 14 |
+
return {"Factually Correct": -1,"Useful": -1,"Context Specific": -1,"User Specific":-1,"Provides Pluralism":-1}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
return scores
|
| 17 |
|
| 18 |
def evaluate_single(self, question,explanation):
|
| 19 |
|
| 20 |
+
evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
|
| 21 |
+
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
|
| 22 |
+
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
|
| 23 |
+
and 1 indicates that the principle is fully satisfied.
|
| 24 |
|
| 25 |
+
Question:
|
| 26 |
{question}
|
| 27 |
|
| 28 |
Provided Explanation:
|
|
|
|
| 32 |
|
| 33 |
Factually Correct:
|
| 34 |
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 35 |
+
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
|
| 36 |
|
| 37 |
Useful:
|
| 38 |
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 39 |
+
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
|
| 40 |
|
| 41 |
Context Specific:
|
| 42 |
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 43 |
+
Score: (0-1) How well does the explanation address the specific context or scenario of the question?
|
| 44 |
|
| 45 |
User Specific:
|
| 46 |
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 47 |
+
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
|
| 48 |
|
| 49 |
Provides Pluralism:
|
| 50 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 51 |
+
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 52 |
|
| 53 |
+
After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
|
| 54 |
|
| 55 |
Example JSON format:
|
| 56 |
+
|
| 57 |
+
Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
|
| 58 |
+
|
| 59 |
+
Answer:
|
| 60 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
|
| 63 |
+
#response = """{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}"""
|
| 64 |
print(response)
|
| 65 |
try:
|
| 66 |
scores = json.loads(response)
|
|
|
|
| 85 |
def evaluate_conversation(self, conversation, context):
|
| 86 |
formatted_conversation = self.format_conversation(conversation)
|
| 87 |
evaluation_prompt = f"""
|
| 88 |
+
You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.
|
| 89 |
+
|
| 90 |
+
Conversation:
|
| 91 |
+
{formatted_conversation}
|
| 92 |
+
|
| 93 |
+
Context:
|
| 94 |
+
{context}
|
| 95 |
+
|
| 96 |
+
Evaluation Criteria:
|
| 97 |
+
|
| 98 |
+
Factually Correct:
|
| 99 |
+
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 100 |
+
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
|
| 101 |
+
|
| 102 |
+
Useful:
|
| 103 |
+
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 104 |
+
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
|
| 105 |
+
|
| 106 |
+
Context Specific:
|
| 107 |
+
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 108 |
+
Score: (0-1) How well does the explanation address the specific context or scenario of the question?
|
| 109 |
+
|
| 110 |
+
User Specific:
|
| 111 |
+
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 112 |
+
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
|
| 113 |
+
|
| 114 |
+
Provides Pluralism:
|
| 115 |
+
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 116 |
+
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 117 |
+
|
| 118 |
+
After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
|
| 119 |
+
|
| 120 |
+
Example JSON format:
|
| 121 |
+
|
| 122 |
+
Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
|
| 123 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
Answer:
|
| 125 |
"""
|
| 126 |
|
| 127 |
print(evaluation_prompt)
|
| 128 |
|
| 129 |
+
response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
|
| 130 |
try:
|
| 131 |
scores = json.loads(response)
|
| 132 |
except json.JSONDecodeError:
|
|
|
|
| 139 |
|
| 140 |
return self.validate_scores(scores)
|
| 141 |
|
|
|
|
| 142 |
def write_evaluation_commentary(scores):
|
| 143 |
evaluation_details = []
|
| 144 |
+
for principle, score in scores.items():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
if score == -1:
|
| 147 |
+
evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
|
|
|
|
|
|
|
| 148 |
continue
|
| 149 |
|
| 150 |
if principle == "Factually Correct":
|
|
|
|
| 183 |
else:
|
| 184 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 185 |
|
| 186 |
+
evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
|
|
|
|
|
|
|
| 187 |
return evaluation_details
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
if __name__ == '__main__':
|
| 190 |
|