Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

File size: 4,488 Bytes

d0c57df
8c3427d
d0c57df
8c3427d
 
 
 
 
d0c57df
 
 
 
 
 
 
 
8c3427d
 
d0c57df
 
 
 
 
 
 
 
8c3427d
d0c57df
 
 
8c3427d
d0c57df
 
 
 
 
 
 
8c3427d
d0c57df
696341e
d0c57df
 
 
 
 
 
 
 
 
 
 
 
 
8c3427d
 
d0c57df

import gradio as gr
import pandas as pd
import matplotlib.pyplot

from app_utils import load_results, visualize_leaderboard

# Load the leaderboard results once at import time. `leaderboard()` works on a
# copy of this frame for every request, so the loaded data is never mutated.
results_df = load_results()

# Markdown introduction rendered at the top of the page via `gr.Markdown`.
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""


# Columns shown in the leaderboard table, in display order.
_TABLE_COLUMNS = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]


def _parse_name_filter(raw_filter):
    """Split the free-text model filter into lowercase name fragments."""
    # Commas and semicolons both separate fragments; spaces are dropped
    # entirely. A missing value (None) is treated like an empty filter.
    cleaned = (raw_filter or "").replace(",", ";").replace(" ", "")
    return [fragment.lower() for fragment in cleaned.split(";") if fragment]


def _no_results_figure():
    """Build a blank figure that carries only a "no results" message."""
    # Local import; matplotlib is already a dependency of this file. Creating
    # the Figure directly (instead of through pyplot) keeps it out of pyplot's
    # global figure registry, so repeated empty searches in a long-running
    # server do not accumulate open figures.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.text(0.5, 0.5, "No models found matching your filter",
            ha='center', va='center', fontsize=14, color='gray')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')
    return fig


def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all"
):
    """Filter the leaderboard and return what the UI should display.

    Args:
        filter_models_by_name: Name fragments separated by "," or ";".
            A row is kept when its "LLM_lower_case" value contains any
            fragment as a literal substring. An empty value, or a fragment
            that is exactly "all" (any case), disables name filtering.
        high_ar_only: When true, keep only rows whose "Answer %" is >= 95.
        size_filter: Keep rows whose "Model Size" equals this value;
            "all" or an empty value disables the filter.
        access_filter: Keep rows whose "Accessibility" equals this value;
            "all" or an empty value disables the filter.

    Returns:
        A ``(figure, table)`` pair: the figure produced by
        ``visualize_leaderboard`` for the remaining rows and a DataFrame
        restricted to the display columns. When no rows remain, the figure
        shows a "no results" message and the table is empty.
    """
    df = results_df.copy()

    # Filter by answer rate if toggle is on
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name. "all" must be a whole fragment to disable the
    # filter; a substring test would also trigger on queries like "small".
    fragments = _parse_name_filter(filter_models_by_name)
    if fragments and "all" not in fragments:
        names = df["LLM_lower_case"]
        matches = pd.Series(False, index=df.index, dtype=bool)
        for fragment in fragments:
            # regex=False: user input is matched literally, so characters
            # such as "(", "+" or "." can neither raise nor over-match.
            matches |= names.str.contains(fragment, regex=False, na=False)
        df = df[matches]

    if df.empty:
        return _no_results_figure(), pd.DataFrame(columns=_TABLE_COLUMNS)

    fig = visualize_leaderboard(df)
    return fig, df[_TABLE_COLUMNS]


# Page layout: logo + introduction, then the plot beside the filter controls,
# then the results table. Every control re-runs `leaderboard` when it changes.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    css="""
    .header-logo {
        display: flex;
        align-items: center;
        gap: 10px;
        margin-bottom: 10px;
    }
    .header-logo img {
        height: 40px;
    }
    footer { display: none !important; }
    """
) as demo:
    # Header: Vectara logo followed by the markdown description.
    gr.HTML(
        '<div class="header-logo"><img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara"></div>'
    )
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Wide column: the chart.
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)

        # Narrow column: the four filter controls.
        with gr.Column(scale=1):
            filter_input = gr.Textbox(value="", show_label=False, placeholder="Filter models...")
            high_ar_toggle = gr.Checkbox(value=False, label="Only models with ≥95% answer rate")
            size_filter = gr.Radio(label="Model size", choices=["all", "small", "large"], value="all")
            access_filter = gr.Radio(label="Model type", choices=["all", "commercial", "open"], value="all")

    with gr.Row():
        table_output = gr.Dataframe(label="Leaderboard", max_height=500, interactive=False)

    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
    outputs = [plot_output, table_output]
    _refresh = {"fn": leaderboard, "inputs": inputs, "outputs": outputs}

    # Populate the plot and table when the page first loads.
    demo.load(**_refresh)

    # Recompute whenever any control changes (registered in `inputs` order).
    for _control in inputs:
        _control.change(**_refresh)


if __name__ == "__main__":
    # Script entry point: serve the app on all network interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)