Spaces:

holistic-ai
/

explainbility_benchmark

Sleeping

App Files Files Community

Zekun Wu committed on May 13, 2024

Commit

ea070cc

1 Parent(s): 0eb1a66

update

Browse files

Files changed (2) hide show

app.py +13 -40
evaluator.py +40 -0

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
-from evaluator import evaluator
 import os
 # Predefined examples
@@ -14,45 +15,7 @@ examples = {
     }
 }
-def write_evaluation_commentary(scores):
-    for principle, score in scores.items():
-        if principle == "Factually Correct":
-            if score >= 0.8:
-                comment = "Excellent accuracy! The information is precise and directly relevant to the question."
-            elif score >= 0.5:
-                comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
-            else:
-                comment = "The explanation contains significant inaccuracies or irrelevant information."
-        elif principle == "Useful":
-            if score >= 0.8:
-                comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
-            elif score >= 0.5:
-                comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
-            else:
-                comment = "The explanation does little to help understand or apply the information provided."
-        elif principle == "Context Specific":
-            if score >= 0.8:
-                comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
-            elif score >= 0.5:
-                comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
-            else:
-                comment = "Fails to address the context of the question, lacking relevance or specificity."
-        elif principle == "User Specific":
-            if score >= 0.8:
-                comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
-            elif score >= 0.5:
-                comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
-            else:
-                comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
-        elif principle == "Provides Pluralism":
-            if score >= 0.8:
-                comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
-            elif score >= 0.5:
-                comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
-            else:
-                comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
-        st.write(f"{principle} ({score}): {comment}")
 # Function to check password
 def check_password():
@@ -101,6 +64,16 @@ else:
             eval = evaluator(model_name)
             scores = eval(question, explanation)
             st.write('### Scores')
-            write_evaluation_commentary(scores)
         else:
             st.error('Please enter both a question and an explanation to evaluate.')

+import pandas as pd
 import streamlit as st
+from evaluator import evaluator,write_evaluation_commentary
 import os
 # Predefined examples
     }
 }
 # Function to check password
 def check_password():
             eval = evaluator(model_name)
             scores = eval(question, explanation)
             st.write('### Scores')
+            details = write_evaluation_commentary(scores)
+            df = pd.DataFrame(details)
+            st.write(df)
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download evaluation as CSV",
+                data=csv,
+                file_name='evaluation.csv',
+                mime='text/csv',
+            )
         else:
             st.error('Please enter both a question and an explanation to evaluate.')

evaluator.py CHANGED Viewed

@@ -75,7 +75,47 @@ class evaluator:
         return self.validate_scores(scores)
 if __name__ == '__main__':
     eval = evaluator()

         return self.validate_scores(scores)
# Score cut-offs shared by every principle: >= 0.8 is the top tier,
# >= 0.5 the middle tier, anything else the bottom tier.
_HIGH_SCORE_THRESHOLD = 0.8
_MEDIUM_SCORE_THRESHOLD = 0.5

# Commentary per known principle, ordered as (top, middle, bottom) tier.
_PRINCIPLE_COMMENTARY = {
    "Factually Correct": (
        "Excellent accuracy! The information is precise and directly relevant to the question.",
        "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
        "The explanation contains significant inaccuracies or irrelevant information.",
    ),
    "Useful": (
        "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
        "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
        "The explanation does little to help understand or apply the information provided.",
    ),
    "Context Specific": (
        "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
        "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
        "Fails to address the context of the question, lacking relevance or specificity.",
    ),
    "User Specific": (
        "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
        "Moderately considerate of the user's knowledge level, but could be more tailored.",
        "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
    ),
    "Provides Pluralism": (
        "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
        "Offers some alternative perspectives, but more could be provided to enrich understanding.",
        "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
    ),
}

# Used when a principle has no entry in the table above, so that an
# unexpected key neither crashes nor inherits another principle's text.
_UNKNOWN_PRINCIPLE_COMMENTARY = "No commentary is available for this principle."


def write_evaluation_commentary(scores: dict) -> list:
    """Build one commentary record per scored principle.

    Args:
        scores: Mapping of principle name to a numeric score. Each score is
            compared against 0.8 and 0.5 to pick a commentary tier.

    Returns:
        A list of dicts, in the iteration order of ``scores``, each with the
        keys ``'Principle'``, ``'Score'`` and ``'Commentary'``. An empty
        mapping yields an empty list. Principles without predefined
        commentary receive a generic fallback message.
    """
    evaluation_details = []
    for principle, score in scores.items():
        tiers = _PRINCIPLE_COMMENTARY.get(principle)
        if tiers is None:
            # Previously `comment` was left unassigned here, which raised
            # UnboundLocalError or reused the previous iteration's text.
            comment = _UNKNOWN_PRINCIPLE_COMMENTARY
        elif score >= _HIGH_SCORE_THRESHOLD:
            comment = tiers[0]
        elif score >= _MEDIUM_SCORE_THRESHOLD:
            comment = tiers[1]
        else:
            comment = tiers[2]
        evaluation_details.append(
            {'Principle': principle, 'Score': score, 'Commentary': comment}
        )
    return evaluation_details
 if __name__ == '__main__':
     eval = evaluator()