playing_with_the_source_code
#2
by
XinGuan2000
- opened
- pages/1_Single_Evaluation.py +1 -1
- pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py} +0 -0
- pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py} +0 -0
- pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py} +0 -0
- pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py} +1 -1
- util/evaluator.py +60 -171
pages/1_Single_Evaluation.py
CHANGED
|
@@ -57,7 +57,7 @@ if not st.session_state.get('password_correct', False):
|
|
| 57 |
check_password()
|
| 58 |
else:
|
| 59 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 60 |
-
model_name = st.selectbox('Select a model:', ['gpt35-1106'])
|
| 61 |
|
| 62 |
# User choice between predefined examples or their own input
|
| 63 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
|
|
|
| 57 |
check_password()
|
| 58 |
else:
|
| 59 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 60 |
+
model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
|
| 61 |
|
| 62 |
# User choice between predefined examples or their own input
|
| 63 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py}
RENAMED
|
File without changes
|
pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py}
RENAMED
|
File without changes
|
pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py}
RENAMED
|
File without changes
|
pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py}
RENAMED
|
@@ -66,7 +66,7 @@ if not st.session_state.get('password_correct', False):
|
|
| 66 |
check_password()
|
| 67 |
else:
|
| 68 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 69 |
-
model_name = st.selectbox('Select a model:', ['gpt35-1106'])
|
| 70 |
|
| 71 |
# User choice between predefined examples or their own input
|
| 72 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
|
|
|
| 66 |
check_password()
|
| 67 |
else:
|
| 68 |
st.sidebar.success("Password Verified. Proceed with the demo.")
|
| 69 |
+
model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
|
| 70 |
|
| 71 |
# User choice between predefined examples or their own input
|
| 72 |
input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
|
util/evaluator.py
CHANGED
|
@@ -9,34 +9,20 @@ class evaluator:
|
|
| 9 |
|
| 10 |
def validate_scores(self, scores):
|
| 11 |
required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
|
| 12 |
-
|
| 13 |
for key in required_keys:
|
| 14 |
-
if key not in scores:
|
| 15 |
-
return {
|
| 16 |
-
|
| 17 |
-
score_data = scores[key]
|
| 18 |
-
|
| 19 |
-
if not isinstance(score_data, dict):
|
| 20 |
-
return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}
|
| 21 |
-
|
| 22 |
-
if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
|
| 23 |
-
0 <= score_data["Score"] <= 10):
|
| 24 |
-
return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}
|
| 25 |
-
|
| 26 |
-
if "Justification" not in score_data or not isinstance(score_data["Justification"], str) or not score_data[
|
| 27 |
-
"Justification"].strip():
|
| 28 |
-
return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}
|
| 29 |
|
| 30 |
return scores
|
| 31 |
|
| 32 |
def evaluate_single(self, question,explanation):
|
| 33 |
|
| 34 |
-
evaluation_prompt = f"""You are provided with a user's
|
| 35 |
-
an
|
| 36 |
-
should be scored on a scale from 0 to
|
| 37 |
-
and
|
| 38 |
|
| 39 |
-
|
| 40 |
{question}
|
| 41 |
|
| 42 |
Provided Explanation:
|
|
@@ -46,55 +32,35 @@ class evaluator:
|
|
| 46 |
|
| 47 |
Factually Correct:
|
| 48 |
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 49 |
-
Score: (0-
|
| 50 |
|
| 51 |
Useful:
|
| 52 |
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 53 |
-
Score: (0-
|
| 54 |
|
| 55 |
Context Specific:
|
| 56 |
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 57 |
-
Score: (0-
|
| 58 |
|
| 59 |
User Specific:
|
| 60 |
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 61 |
-
Score: (0-
|
| 62 |
|
| 63 |
Provides Pluralism:
|
| 64 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 65 |
-
Score: (0-
|
| 66 |
|
| 67 |
-
After evaluating the provided question and explanation based on the five principles, please format your scores
|
| 68 |
|
| 69 |
Example JSON format:
|
| 70 |
-
|
| 71 |
-
"Factually Correct":
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
"Useful": {{
|
| 76 |
-
"Justification": "xxx",
|
| 77 |
-
"Score": 8.5
|
| 78 |
-
}},
|
| 79 |
-
"Context Specific": {{
|
| 80 |
-
"Justification": "xxx",
|
| 81 |
-
"Score": 8
|
| 82 |
-
}},
|
| 83 |
-
"User Specific": {{
|
| 84 |
-
"Justification": "xxx",
|
| 85 |
-
"Score": 7.5
|
| 86 |
-
}},
|
| 87 |
-
"Provides Pluralism": {{
|
| 88 |
-
"Justification": "xxx",
|
| 89 |
-
"Score": 7
|
| 90 |
-
}}
|
| 91 |
-
}}
|
| 92 |
-
|
| 93 |
-
Answer:
|
| 94 |
-
"""
|
| 95 |
-
|
| 96 |
-
response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()
|
| 97 |
|
|
|
|
|
|
|
| 98 |
print(response)
|
| 99 |
try:
|
| 100 |
scores = json.loads(response)
|
|
@@ -119,70 +85,48 @@ class evaluator:
|
|
| 119 |
def evaluate_conversation(self, conversation, context):
|
| 120 |
formatted_conversation = self.format_conversation(conversation)
|
| 121 |
evaluation_prompt = f"""
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
"Factually Correct": {{
|
| 159 |
-
"Justification": "xxx",
|
| 160 |
-
"Score": 9
|
| 161 |
-
}},
|
| 162 |
-
"Useful": {{
|
| 163 |
-
"Justification": "xxx",
|
| 164 |
-
"Score": 8.5
|
| 165 |
-
}},
|
| 166 |
-
"Context Specific": {{
|
| 167 |
-
"Justification": "xxx",
|
| 168 |
-
"Score": 8
|
| 169 |
-
}},
|
| 170 |
-
"User Specific": {{
|
| 171 |
-
"Justification": "xxx",
|
| 172 |
-
"Score": 7.5
|
| 173 |
-
}},
|
| 174 |
-
"Provides Pluralism": {{
|
| 175 |
-
"Justification": "xxx",
|
| 176 |
-
"Score": 7
|
| 177 |
-
}}
|
| 178 |
-
}}
|
| 179 |
-
|
| 180 |
Answer:
|
| 181 |
"""
|
| 182 |
|
| 183 |
print(evaluation_prompt)
|
| 184 |
|
| 185 |
-
response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=
|
| 186 |
try:
|
| 187 |
scores = json.loads(response)
|
| 188 |
except json.JSONDecodeError:
|
|
@@ -195,19 +139,12 @@ class evaluator:
|
|
| 195 |
|
| 196 |
return self.validate_scores(scores)
|
| 197 |
|
| 198 |
-
|
| 199 |
def write_evaluation_commentary(scores):
|
| 200 |
evaluation_details = []
|
| 201 |
-
|
| 202 |
-
for principle, details in scores.items():
|
| 203 |
-
print(details)
|
| 204 |
-
score = details.get('Score', -1)
|
| 205 |
-
justification = details.get('Justification', '')
|
| 206 |
|
| 207 |
if score == -1:
|
| 208 |
-
evaluation_details.append(
|
| 209 |
-
{'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
|
| 210 |
-
'Justification': justification})
|
| 211 |
continue
|
| 212 |
|
| 213 |
if principle == "Factually Correct":
|
|
@@ -246,56 +183,8 @@ def write_evaluation_commentary(scores):
|
|
| 246 |
else:
|
| 247 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 248 |
|
| 249 |
-
evaluation_details.append(
|
| 250 |
-
{'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})
|
| 251 |
-
|
| 252 |
return evaluation_details
|
| 253 |
-
# def write_evaluation_commentary(scores):
|
| 254 |
-
# evaluation_details = []
|
| 255 |
-
# for principle, score in scores.items():
|
| 256 |
-
#
|
| 257 |
-
# if score == -1:
|
| 258 |
-
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
|
| 259 |
-
# continue
|
| 260 |
-
#
|
| 261 |
-
# if principle == "Factually Correct":
|
| 262 |
-
# if score >= 0.8:
|
| 263 |
-
# comment = "Excellent accuracy! The information is precise and directly relevant to the question."
|
| 264 |
-
# elif score >= 0.5:
|
| 265 |
-
# comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
|
| 266 |
-
# else:
|
| 267 |
-
# comment = "The explanation contains significant inaccuracies or irrelevant information."
|
| 268 |
-
# elif principle == "Useful":
|
| 269 |
-
# if score >= 0.8:
|
| 270 |
-
# comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
|
| 271 |
-
# elif score >= 0.5:
|
| 272 |
-
# comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
|
| 273 |
-
# else:
|
| 274 |
-
# comment = "The explanation does little to help understand or apply the information provided."
|
| 275 |
-
# elif principle == "Context Specific":
|
| 276 |
-
# if score >= 0.8:
|
| 277 |
-
# comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
|
| 278 |
-
# elif score >= 0.5:
|
| 279 |
-
# comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
|
| 280 |
-
# else:
|
| 281 |
-
# comment = "Fails to address the context of the question, lacking relevance or specificity."
|
| 282 |
-
# elif principle == "User Specific":
|
| 283 |
-
# if score >= 0.8:
|
| 284 |
-
# comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
|
| 285 |
-
# elif score >= 0.5:
|
| 286 |
-
# comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
|
| 287 |
-
# else:
|
| 288 |
-
# comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
|
| 289 |
-
# elif principle == "Provides Pluralism":
|
| 290 |
-
# if score >= 0.8:
|
| 291 |
-
# comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
|
| 292 |
-
# elif score >= 0.5:
|
| 293 |
-
# comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
|
| 294 |
-
# else:
|
| 295 |
-
# comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 296 |
-
#
|
| 297 |
-
# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
|
| 298 |
-
# return evaluation_details
|
| 299 |
|
| 300 |
if __name__ == '__main__':
|
| 301 |
|
|
|
|
| 9 |
|
| 10 |
def validate_scores(self, scores):
|
| 11 |
required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
|
|
|
|
| 12 |
for key in required_keys:
|
| 13 |
+
if key not in scores or not isinstance(scores[key], (int, float)) or not (-1 <= scores[key] <= 1):
|
| 14 |
+
return {"Factually Correct": -1,"Useful": -1,"Context Specific": -1,"User Specific":-1,"Provides Pluralism":-1}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
return scores
|
| 17 |
|
| 18 |
def evaluate_single(self, question,explanation):
|
| 19 |
|
| 20 |
+
evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
|
| 21 |
+
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
|
| 22 |
+
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
|
| 23 |
+
and 1 indicates that the principle is fully satisfied.
|
| 24 |
|
| 25 |
+
Question:
|
| 26 |
{question}
|
| 27 |
|
| 28 |
Provided Explanation:
|
|
|
|
| 32 |
|
| 33 |
Factually Correct:
|
| 34 |
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 35 |
+
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
|
| 36 |
|
| 37 |
Useful:
|
| 38 |
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 39 |
+
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
|
| 40 |
|
| 41 |
Context Specific:
|
| 42 |
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 43 |
+
Score: (0-1) How well does the explanation address the specific context or scenario of the question?
|
| 44 |
|
| 45 |
User Specific:
|
| 46 |
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 47 |
+
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
|
| 48 |
|
| 49 |
Provides Pluralism:
|
| 50 |
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 51 |
+
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 52 |
|
| 53 |
+
After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
|
| 54 |
|
| 55 |
Example JSON format:
|
| 56 |
+
|
| 57 |
+
Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
|
| 58 |
+
|
| 59 |
+
Answer:
|
| 60 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
|
| 63 |
+
#response = """{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}"""
|
| 64 |
print(response)
|
| 65 |
try:
|
| 66 |
scores = json.loads(response)
|
|
|
|
| 85 |
def evaluate_conversation(self, conversation, context):
|
| 86 |
formatted_conversation = self.format_conversation(conversation)
|
| 87 |
evaluation_prompt = f"""
|
| 88 |
+
You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.
|
| 89 |
+
|
| 90 |
+
Conversation:
|
| 91 |
+
{formatted_conversation}
|
| 92 |
+
|
| 93 |
+
Context:
|
| 94 |
+
{context}
|
| 95 |
+
|
| 96 |
+
Evaluation Criteria:
|
| 97 |
+
|
| 98 |
+
Factually Correct:
|
| 99 |
+
Definition: The explanation must be accurate and relevant to the question and the subject matter.
|
| 100 |
+
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
|
| 101 |
+
|
| 102 |
+
Useful:
|
| 103 |
+
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
|
| 104 |
+
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
|
| 105 |
+
|
| 106 |
+
Context Specific:
|
| 107 |
+
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
|
| 108 |
+
Score: (0-1) How well does the explanation address the specific context or scenario of the question?
|
| 109 |
+
|
| 110 |
+
User Specific:
|
| 111 |
+
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
|
| 112 |
+
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
|
| 113 |
+
|
| 114 |
+
Provides Pluralism:
|
| 115 |
+
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
|
| 116 |
+
Score: (0-1) How well does the explanation provide or support multiple perspectives?
|
| 117 |
+
|
| 118 |
+
After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
|
| 119 |
+
|
| 120 |
+
Example JSON format:
|
| 121 |
+
|
| 122 |
+
Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
|
| 123 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
Answer:
|
| 125 |
"""
|
| 126 |
|
| 127 |
print(evaluation_prompt)
|
| 128 |
|
| 129 |
+
response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
|
| 130 |
try:
|
| 131 |
scores = json.loads(response)
|
| 132 |
except json.JSONDecodeError:
|
|
|
|
| 139 |
|
| 140 |
return self.validate_scores(scores)
|
| 141 |
|
|
|
|
| 142 |
def write_evaluation_commentary(scores):
|
| 143 |
evaluation_details = []
|
| 144 |
+
for principle, score in scores.items():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
if score == -1:
|
| 147 |
+
evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
|
|
|
|
|
|
|
| 148 |
continue
|
| 149 |
|
| 150 |
if principle == "Factually Correct":
|
|
|
|
| 183 |
else:
|
| 184 |
comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
|
| 185 |
|
| 186 |
+
evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
|
|
|
|
|
|
|
| 187 |
return evaluation_details
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
if __name__ == '__main__':
|
| 190 |
|