File size: 5,339 Bytes
d0c57df
8c3427d
7cd85bf
8c3427d
0e2da72
8c3427d
 
 
d0c57df
 
 
 
 
 
 
 
8c3427d
 
d0c57df
 
 
 
0e2da72
 
d0c57df
 
 
8c3427d
0e2da72
 
 
d0c57df
 
 
8c3427d
d0c57df
 
 
 
 
 
 
8c3427d
d0c57df
696341e
d0c57df
 
 
 
 
 
7cd85bf
 
 
 
 
 
 
 
 
 
d0c57df
8c3427d
 
d0c57df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7cd85bf
0e2da72
 
 
 
 
d0c57df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e2da72
 
d0c57df
 
 
 
0e2da72
 
 
 
 
 
 
d0c57df
 
 
 
 
 
 
 
 
0e2da72
d0c57df
 
 
 
 
 
 
 
 
 
0e2da72
d0c57df
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import re

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP

# Load evaluation results once at import time; leaderboard() filters a copy
# of this DataFrame on every UI interaction.
results_df = load_results()

# Markdown rendered at the top of the page (title + methodology summary).
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""


def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all",
    data_slice: str = "Overall"
):
    """Filter the results and build the leaderboard plot and table.

    Args:
        filter_models_by_name: Comma- or semicolon-separated model-name
            substrings (case-insensitive). Empty, or any term equal to
            "all", disables name filtering.
        high_ar_only: If True, keep only models with an answer rate >= 95%.
        size_filter: Model-size bucket ("all" disables the filter).
        access_filter: Accessibility bucket ("all" disables the filter).
        data_slice: Key into DATA_SLICE_MAP selecting which data slice's
            metrics to compute and sort by.

    Returns:
        A (plotly Figure, pandas DataFrame) pair matching the Gradio
        plot and table outputs.
    """
    df = results_df.copy()

    # Apply data slice first (recalculates metrics and re-sorts)
    df = apply_data_slice(df, data_slice)

    # Filter by answer rate if toggle is on
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name: normalize separators, then match any term.
    filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
    if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name.lower():
        filter_list = [name.lower() for name in filter_models_by_name.split(";") if name]
        # str.contains interprets its pattern as a regex, so each user-typed
        # term must be escaped — otherwise input like "gpt-4(" or "c++"
        # raises re.error and crashes the callback.
        pattern = "|".join(re.escape(name) for name in filter_list)
        df = df[df["LLM_lower_case"].str.contains(pattern, na=False)]

    if len(df) == 0:
        # Show "no results" message in the plot instead of an empty chart.
        fig = go.Figure()
        fig.add_annotation(
            text="No models found matching your filter",
            xref="paper", yref="paper", x=0.5, y=0.5,
            showarrow=False, font=dict(size=14, color="gray")
        )
        fig.update_layout(
            xaxis=dict(visible=False), yaxis=dict(visible=False),
            height=400, margin=dict(l=50, r=50, t=50, b=50)
        )
        return fig, pd.DataFrame(columns=["LLM", "Hallucination %", "Answer %", "Avg Summary Words"])

    fig = visualize_leaderboard(df)
    return fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]


# Build the Gradio UI: header, description, plot + filter sidebar, table,
# and event wiring. Every filter change re-runs leaderboard() end to end.
with gr.Blocks(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    theme=gr.themes.Soft(),
    # Inline CSS: brand header sizing, hide the Gradio footer and the Plotly
    # modebar, and lay radio options out horizontally.
    css="""
    .header-logo {
        display: flex;
        align-items: center;
        gap: 10px;
        margin-bottom: 10px;
    }
    .header-logo img {
        height: 40px;
    }
    footer { display: none !important; }
    .modebar { display: none !important; }
    .horizontal-radio .wrap {
        display: flex !important;
        flex-direction: row !important;
        gap: 8px !important;
    }
    """
) as demo:
    # Vectara logo banner above the description.
    gr.HTML(
        '<div class="header-logo">'
        '<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
        '</div>'
    )
    gr.Markdown(DESCRIPTION)

    # Main row: plot on the left (3/4 width), filter controls on the right.
    with gr.Row():
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)
        with gr.Column(scale=1):
            # Free-text model-name filter (comma/semicolon-separated terms).
            filter_input = gr.Textbox(
                placeholder="Filter models...",
                show_label=False,
                value=""
            )
            high_ar_toggle = gr.Checkbox(
                label="Only models with ≥95% answer rate",
                value=False
            )
            # Choices must match the "Model Size" column values in results_df.
            size_filter = gr.Radio(
                choices=["all", "small", "large"],
                value="all",
                label="Model size",
                elem_classes=["horizontal-radio"]
            )
            # Choices must match the "Accessibility" column values in results_df.
            access_filter = gr.Radio(
                choices=["all", "commercial", "open"],
                value="all",
                label="Model type",
                elem_classes=["horizontal-radio"]
            )
            data_slice = gr.Dropdown(
                choices=list(DATA_SLICE_MAP.keys()),
                value="Overall",
                label="Data Slice"
            )

    # Results table below the plot; read-only, scrolls past 500px.
    with gr.Row():
        table_output = gr.Dataframe(
            label="Leaderboard",
            interactive=False,
            max_height=500
        )

    # Input order must match the leaderboard() parameter order.
    inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
    outputs = [plot_output, table_output]

    # Load initial data on page load
    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)

    # Update on filter change or toggle change
    filter_input.change(fn=leaderboard, inputs=inputs, outputs=outputs)
    high_ar_toggle.change(fn=leaderboard, inputs=inputs, outputs=outputs)
    size_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
    access_filter.change(fn=leaderboard, inputs=inputs, outputs=outputs)
    data_slice.change(fn=leaderboard, inputs=inputs, outputs=outputs)


if __name__ == "__main__":
    # Bind to all interfaces on the conventional HF Spaces port so the app
    # is reachable from outside a container.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)