Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,339 Bytes
d0c57df 8c3427d 7cd85bf 8c3427d 0e2da72 8c3427d d0c57df 8c3427d d0c57df 0e2da72 d0c57df 8c3427d 0e2da72 d0c57df 8c3427d d0c57df 8c3427d d0c57df 696341e d0c57df 7cd85bf d0c57df 8c3427d d0c57df 7cd85bf 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import re

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
results_df = load_results()
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""
def leaderboard(
filter_models_by_name: str = "",
high_ar_only: bool = False,
size_filter: str = "all",
access_filter: str = "all",
data_slice: str = "Overall"
):
"""Filter and display the leaderboard."""
df = results_df.copy()
# Apply data slice first (recalculates metrics and re-sorts)
df = apply_data_slice(df, data_slice)
# Filter by answer rate if toggle is on
if high_ar_only:
df = df[df["Answer %"] >= 95]
# Filter by model size
if size_filter and size_filter != "all":
df = df[df["Model Size"] == size_filter]
# Filter by accessibility
if access_filter and access_filter != "all":
df = df[df["Accessibility"] == access_filter]
# Filter by model name
filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name.lower():
filter_list = [name.lower() for name in filter_models_by_name.split(";") if name]
df = df[df["LLM_lower_case"].str.contains("|".join(filter_list), na=False)]
if len(df) == 0:
# Show "no results" message in the plot
fig = go.Figure()
fig.add_annotation(
text="No models found matching your filter",
xref="paper", yref="paper", x=0.5, y=0.5,
showarrow=False, font=dict(size=14, color="gray")
)
fig.update_layout(
xaxis=dict(visible=False), yaxis=dict(visible=False),
height=400, margin=dict(l=50, r=50, t=50, b=50)
)
return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])
fig = visualize_leaderboard(df)
return fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
with gr.Blocks(
title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
theme=gr.themes.Soft(),
css="""
.header-logo {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 10px;
}
.header-logo img {
height: 40px;
}
footer { display: none !important; }
.modebar { display: none !important; }
.horizontal-radio .wrap {
display: flex !important;
flex-direction: row !important;
gap: 8px !important;
}
"""
) as demo:
gr.HTML(
'<div class="header-logo">'
'<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
'</div>'
)
gr.Markdown(DESCRIPTION)
with gr.Row():
with gr.Column(scale=3):
plot_output = gr.Plot(show_label=False)
with gr.Column(scale=1):
filter_input = gr.Textbox(
placeholder="Filter models...",
show_label=False,
value=""
)
high_ar_toggle = gr.Checkbox(
label="Only models with ≥95% answer rate",
value=False
)
size_filter = gr.Radio(
choices=["all", "small", "large"],
value="all",
label="Model size",
elem_classes=["horizontal-radio"]
)
access_filter = gr.Radio(
choices=["all", "commercial", "open"],
value="all",
label="Model type",
elem_classes=["horizontal-radio"]
)
data_slice = gr.Dropdown(
choices=list(DATA_SLICE_MAP.keys()),
value="Overall",
label="Data Slice"
)
with gr.Row():
table_output = gr.Dataframe(
label="Leaderboard",
interactive=False,
max_height=500
)
inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
outputs = [plot_output, table_output]
# Load initial data on page load
demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)
# Update on filter change or toggle change
filter_input.change(fn=leaderboard, inputs=inputs, outputs=outputs)
high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860) |