Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,339 Bytes
d0c57df 8c3427d 7cd85bf 8c3427d 0e2da72 8c3427d d0c57df 8c3427d d0c57df 0e2da72 d0c57df 8c3427d 0e2da72 d0c57df 8c3427d d0c57df 8c3427d d0c57df 696341e d0c57df 7cd85bf d0c57df 8c3427d d0c57df 7cd85bf 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df 0e2da72 d0c57df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import re

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP
results_df = load_results()
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""
def leaderboard(
filter_models_by_name: str = "",
high_ar_only: bool = False,
size_filter: str = "all",
access_filter: str = "all",
data_slice: str = "Overall"
):
"""Filter and display the leaderboard."""
df = results_df.copy()
# Apply data slice first (recalculates metrics and re-sorts)
df = apply_data_slice(df, data_slice)
# Filter by answer rate if toggle is on
if high_ar_only:
df = df[df["Answer %"] >= 95]
# Filter by model size
if size_filter and size_filter != "all":
df = df[df["Model Size"] == size_filter]
# Filter by accessibility
if access_filter and access_filter != "all":
df = df[df["Accessibility"] == access_filter]
# Filter by model name
filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name.lower():
filter_list = [name.lower() for name in filter_models_by_name.split(";") if name]
df = df[df["LLM_lower_case"].str.contains("|".join(filter_list), na=False)]
if len(df) == 0:
# Show "no results" message in the plot
fig = go.Figure()
fig.add_annotation(
text="No models found matching your filter",
xref="paper", yref="paper", x=0.5, y=0.5,
showarrow=False, font=dict(size=14, color="gray")
)
fig.update_layout(
xaxis=dict(visible=False), yaxis=dict(visible=False),
height=400, margin=dict(l=50, r=50, t=50, b=50)
)
return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])
fig = visualize_leaderboard(df)
return fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
with gr.Blocks(
title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
theme=gr.themes.Soft(),
css="""
.header-logo {
display: flex;
align-items: center;
gap: 10px;
margin-bottom: 10px;
}
.header-logo img {
height: 40px;
}
footer { display: none !important; }
.modebar { display: none !important; }
.horizontal-radio .wrap {
display: flex !important;
flex-direction: row !important;
gap: 8px !important;
}
"""
) as demo:
gr.HTML(
'<div class="header-logo">'
'<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
'</div>'
)
gr.Markdown(DESCRIPTION)
with gr.Row():
with gr.Column(scale=3):
plot_output = gr.Plot(show_label=False)
with gr.Column(scale=1):
filter_input = gr.Textbox(
placeholder="Filter models...",
show_label=False,
value=""
)
high_ar_toggle = gr.Checkbox(
label="Only models with ≥95% answer rate",
value=False
)
size_filter = gr.Radio(
choices=["all", "small", "large"],
value="all",
label="Model size",
elem_classes=["horizontal-radio"]
)
access_filter = gr.Radio(
choices=["all", "commercial", "open"],
value="all",
label="Model type",
elem_classes=["horizontal-radio"]
)
data_slice = gr.Dropdown(
choices=list(DATA_SLICE_MAP.keys()),
value="Overall",
label="Data Slice"
)
with gr.Row():
table_output = gr.Dataframe(
label="Leaderboard",
interactive=False,
max_height=500
)
inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
outputs = [plot_output, table_output]
# Load initial data on page load
demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)
# Update on filter change or toggle change
filter_input.change(fn=leaderboard, inputs=inputs, outputs=outputs)
high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860) |