Create single-evaluation
#1
by loljk
- opened
- pages/single-evaluation +80 -0
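Adds a Streamlit page that scores a question/explanation pair with two Hugging Face text-classification models (a hallucination evaluator and an NLI model), reports the absolute difference between their top scores, and gates access behind a password.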
pages/single-evaluation
ADDED
@@ -0,0 +1,80 @@
```python
import pandas as pd  # imported but unused in this page
import streamlit as st
from transformers import pipeline
import os

# Load models (downloaded from the Hugging Face Hub on first run)
model1 = pipeline("text-classification", model="vectara/hallucination_evaluation_model")
model2 = pipeline("text-classification", model="sileod/deberta-v3-base-tasksource-nli")
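# Note: each pipeline call returns a list of {'label': ..., 'score': ...} dicts;
# the actual label names come from the respective model cards. Depending on the
# transformers version, the Vectara model may also need trust_remote_code=True.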

# Predefined examples
examples = {
    'good': {
        'question': "What causes rainbows to appear in the sky?",
        'explanation': "Rainbows appear when sunlight is refracted, dispersed, and reflected inside water droplets in the atmosphere, resulting in a spectrum of light appearing in the sky.",
        'ground_truth': "Correct"
    },
    'bad': {
        'question': "What causes rainbows to appear in the sky?",
        'explanation': "Rainbows happen because light in the sky gets mixed up and sometimes shows colors when it's raining or when there is water around.",
        'ground_truth': "Incorrect"
    }
}

# Function to evaluate explanations using the two models
def evaluate_explanation(question, explanation):
    # Only the explanation text is scored; the question is not passed to either model.
    results1 = model1(explanation)
    results2 = model2(explanation)
    return results1, results2

# Function to compare results (absolute difference of the top-label scores)
def compare_vectors(v1, v2):
    diff = abs(v1[0]['score'] - v2[0]['score'])
    return diff
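# Caveat: the two scores come from different label spaces (hallucination
# consistency vs. NLI entailment), so the difference is a rough signal,
# not a calibrated metric.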

# Title of the application
st.title('Dual Model Evaluation of Explanations')

# Check for password before allowing access
def check_password():
    def password_entered():
        # Compare against the PASSWORD environment variable (must be set for the app).
        if st.session_state.get('password_input') == os.getenv('PASSWORD'):
            st.session_state['password_correct'] = True
        else:
            st.error("Incorrect Password, please try again.")

    # Keyed input so the on_click callback can read the submitted value
    # from session state rather than a stale closure variable.
    st.text_input("Enter Password:", type="password", key="password_input")
    submit_button = st.button("Submit", on_click=password_entered)

    if submit_button and not st.session_state.get('password_correct', False):
        st.error("Please enter a valid password to access the demo.")

# Password check
if not st.session_state.get('password_correct', False):
    check_password()
else:
    st.sidebar.success("Password Verified. Proceed with the demo.")
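
    # Everything below renders only after the password check succeeds.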
    input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
    if input_type == 'Use predefined example':
        example_type = st.radio("Select an example type:", ('good', 'bad'))
        selected_example = examples[example_type]
        question = selected_example['question']
        explanation = selected_example['explanation']
        ground_truth = selected_example['ground_truth']
    else:
        question = st.text_input('Enter your question:', '')
        explanation = st.text_input('Enter your explanation:', '')
        ground_truth = st.text_input('Enter ground truth:', '')

    if st.button('Evaluate Explanation'):
        # ground_truth is only checked for presence; it is not used in scoring.
        if question and explanation and ground_truth:
            results1, results2 = evaluate_explanation(question, explanation)
            diff = compare_vectors(results1, results2)
            st.write('### Model 1 Results')
            st.write(results1)
            st.write('### Model 2 Results')
            st.write(results2)
            st.write(f'### Score Difference: {diff}')
        else:
            st.error('Please enter a question, explanation, and ground truth to evaluate.')
```
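For reference, a minimal sketch of what `compare_vectors` consumes, runnable outside Streamlit: each pipeline returns a list of `{'label', 'score'}` dicts, and only the top entry's score is used. The labels and scores below are stubs for illustration, not actual outputs of either model.

```python
# Standalone check of compare_vectors on stubbed pipeline outputs.
# Labels/scores are placeholders; real ones come from the two model cards.

def compare_vectors(v1, v2):
    # Absolute difference between the two top-ranked scores.
    return abs(v1[0]['score'] - v2[0]['score'])

results1 = [{'label': 'consistent', 'score': 0.91}]  # stub for model1
results2 = [{'label': 'entailment', 'score': 0.34}]  # stub for model2

print(compare_vectors(results1, results2))  # ~0.57
```

Since the gate compares the input against `os.getenv('PASSWORD')`, the PASSWORD environment variable (e.g. a Space secret) must be set before launching the page with `streamlit run`, or the check can never succeed.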