# Source: leaderboard/app/app.py (Hugging Face Space file view)
# Last commit d0c57df by ofermend: "updated to gradio; python 3.11; visual improvements"
# File size: 4.49 kB
import gradio as gr
import pandas as pd
import matplotlib.pyplot
from app_utils import load_results, visualize_leaderboard
# Leaderboard data, loaded once at import time; `leaderboard()` filters a
# copy of this frame on every call rather than reloading it.
results_df = load_results()
# Markdown text rendered at the top of the page via `gr.Markdown(DESCRIPTION)`.
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""
def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all",
):
    """Filter the leaderboard and return the plot and table to display.

    Args:
        filter_models_by_name: Model-name fragments separated by ``;`` or
            ``,``. Spaces are removed and matching is a case-insensitive
            literal substring match against the ``LLM_lower_case`` column.
            An empty value, ``None``, or a fragment that is exactly ``all``
            disables name filtering.
        high_ar_only: When true, keep only rows whose ``Answer %`` is at
            least 95.
        size_filter: Value to match exactly against ``Model Size``;
            ``"all"`` or an empty value disables the filter.
        access_filter: Value to match exactly against ``Accessibility``;
            ``"all"`` or an empty value disables the filter.

    Returns:
        A ``(figure, table)`` tuple. ``figure`` is whatever
        ``visualize_leaderboard`` returns for the filtered rows, or a
        placeholder matplotlib figure with a "no results" message when
        nothing matches. ``table`` is a DataFrame restricted to the
        display columns (empty when nothing matches).
    """
    import re  # local import: only needed to escape user-typed filter text

    display_columns = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]
    df = results_df.copy()

    # Filter by answer rate if the toggle is on.
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size.
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility.
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name. `or ""` guards against a None textbox value.
    raw_filter = (filter_models_by_name or "").replace(",", ";").replace(" ", "")
    name_terms = [term.lower() for term in raw_filter.split(";") if term]
    # Compare whole terms against "all": a substring test would make inputs
    # such as "small" silently disable the name filter.
    if name_terms and "all" not in name_terms:
        # Escape each term so regex metacharacters typed by the user
        # ("(", "+", ".", ...) are matched literally instead of raising
        # re.error or matching unintended names.
        pattern = "|".join(re.escape(term) for term in name_terms)
        df = df[df["LLM_lower_case"].str.contains(pattern, na=False)]

    if df.empty:
        # Show a "no results" message in the plot. The figure is built
        # directly instead of through pyplot so it is not kept alive in
        # pyplot's global figure registry after this call returns.
        fig = matplotlib.figure.Figure(figsize=(10, 5))
        ax = fig.subplots()
        ax.text(
            0.5,
            0.5,
            "No models found matching your filter",
            ha="center",
            va="center",
            fontsize=14,
            color="gray",
        )
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis("off")
        return fig, pd.DataFrame(columns=display_columns)

    fig = visualize_leaderboard(df)
    return fig, df[display_columns]
# Page layout: logo and description on top, then the plot beside the filter
# controls, then the leaderboard table underneath.
with gr.Blocks(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    theme=gr.themes.Soft(),
    css="""
.header-logo {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 10px;
}
.header-logo img {
height: 40px;
}
footer { display: none !important; }
""",
) as demo:
    gr.HTML(
        '<div class="header-logo">'
        '<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
        '</div>'
    )
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Wide column: the chart.
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)
        # Narrow column: the filter controls passed to `leaderboard`.
        with gr.Column(scale=1):
            filter_input = gr.Textbox(
                placeholder="Filter models...",
                show_label=False,
                value="",
            )
            high_ar_toggle = gr.Checkbox(
                label="Only models with ≥95% answer rate",
                value=False,
            )
            size_filter = gr.Radio(
                choices=["all", "small", "large"],
                value="all",
                label="Model size",
            )
            access_filter = gr.Radio(
                choices=["all", "commercial", "open"],
                value="all",
                label="Model type",
            )

    with gr.Row():
        table_output = gr.Dataframe(
            label="Leaderboard",
            interactive=False,
            max_height=500,
        )

    # Component order matches the positional parameters and the return tuple
    # of `leaderboard`.
    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
    outputs = [plot_output, table_output]
    _refresh = {"fn": leaderboard, "inputs": inputs, "outputs": outputs}

    # Populate the plot and table when the page first loads.
    demo.load(**_refresh)

    # Re-run the leaderboard whenever any filter control changes.
    for _control in inputs:
        _control.change(**_refresh)
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script,
    # listening on all network interfaces at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)