# leaderboard / app / app.py
# Last commit: "updated" (0e2da72) by ofermend
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
# Leaderboard results, loaded once at import time. `leaderboard()` works on a
# copy of this frame for every request, so the module-level data is never
# filtered in place.
results_df = load_results()
# Markdown introduction rendered at the top of the page via `gr.Markdown`.
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""
def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all",
    data_slice: str = "Overall"
):
    """Filter the leaderboard and build the plot and table shown in the UI.

    Args:
        filter_models_by_name: Free-text model filter. Terms are separated by
            ``;`` or ``,``; spaces are removed. A row is kept when its
            ``LLM_lower_case`` value contains any term as a literal,
            case-insensitive substring. An empty value, or a term that is
            exactly ``all``, disables the name filter.
        high_ar_only: When true, keep only rows whose ``Answer %`` is >= 95.
        size_filter: ``"all"`` or a value to match against ``Model Size``.
        access_filter: ``"all"`` or a value to match against ``Accessibility``.
        data_slice: Slice name forwarded to ``apply_data_slice``.

    Returns:
        A ``(figure, table)`` tuple: the Plotly figure for the remaining rows
        (or a placeholder figure with a "no results" annotation) and a
        DataFrame restricted to the columns displayed in the table.
    """
    # Local import keeps this block self-contained; `re` is only needed to
    # escape user-typed filter terms.
    import re

    table_columns = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]

    df = results_df.copy()
    # Apply data slice first (recalculates metrics and re-sorts)
    df = apply_data_slice(df, data_slice)

    # Filter by answer rate if toggle is on
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name. `None` (e.g. a cleared textbox) is treated as "".
    normalized = (filter_models_by_name or "").replace(",", ";").replace(" ", "")
    terms = [term.lower() for term in normalized.split(";") if term]
    # Only an exact "all" term turns the filter off; a substring test would
    # also disable it for searches such as "small".
    if terms and "all" not in terms:
        # `str.contains` interprets its pattern as a regex, so escape each
        # term: characters like "(", "+" or "." must match literally instead
        # of raising `re.error` or acting as wildcards.
        pattern = "|".join(re.escape(term) for term in terms)
        df = df[df["LLM_lower_case"].str.contains(pattern, na=False)]

    if df.empty:
        # Show "no results" message in the plot
        fig = go.Figure()
        fig.add_annotation(
            text="No models found matching your filter",
            xref="paper", yref="paper", x=0.5, y=0.5,
            showarrow=False, font=dict(size=14, color="gray")
        )
        fig.update_layout(
            xaxis=dict(visible=False), yaxis=dict(visible=False),
            height=400, margin=dict(l=50, r=50, t=50, b=50)
        )
        return fig, pd.DataFrame(columns=table_columns)

    fig = visualize_leaderboard(df)
    return fig, df[table_columns]
with gr.Blocks(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    theme=gr.themes.Soft(),
    css="""
.header-logo {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 10px;
}
.header-logo img {
height: 40px;
}
footer { display: none !important; }
.modebar { display: none !important; }
.horizontal-radio .wrap {
display: flex !important;
flex-direction: row !important;
gap: 8px !important;
}
"""
) as demo:
    # Page header: logo followed by the Markdown introduction.
    gr.HTML(
        '<div class="header-logo"><img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara"></div>'
    )
    gr.Markdown(DESCRIPTION)

    # Main row: chart on the left (3/4 width), filter controls on the right.
    with gr.Row():
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)

        with gr.Column(scale=1):
            filter_input = gr.Textbox(
                value="",
                placeholder="Filter models...",
                show_label=False,
            )
            high_ar_toggle = gr.Checkbox(
                value=False,
                label="Only models with ≥95% answer rate",
            )
            size_filter = gr.Radio(
                label="Model size",
                choices=["all", "small", "large"],
                value="all",
                elem_classes=["horizontal-radio"],
            )
            access_filter = gr.Radio(
                label="Model type",
                choices=["all", "commercial", "open"],
                value="all",
                elem_classes=["horizontal-radio"],
            )
            data_slice = gr.Dropdown(
                label="Data Slice",
                choices=list(DATA_SLICE_MAP.keys()),
                value="Overall",
            )

    # Second row: the read-only results table.
    with gr.Row():
        table_output = gr.Dataframe(
            label="Leaderboard",
            interactive=False,
            max_height=500,
        )

    # Component order here must match the parameter order of `leaderboard`.
    inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
    outputs = [plot_output, table_output]

    # Populate the plot and table when the page first loads.
    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)

    # Every control triggers the same refresh whenever its value changes.
    for _control in inputs:
        _control.change(fn=leaderboard, inputs=inputs, outputs=outputs)

# Script entry point: serve the app on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)