"""Gradio app for the Hughes Hallucination Evaluation Model (HHEM) leaderboard.

Loads the results table once at import time, then serves a plot and a table
that are re-rendered whenever one of the filter controls changes.
"""

import re

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP

# Loaded once at startup; every request works on a copy (see leaderboard()).
results_df = load_results()

# The heading must sit on its own line, otherwise Markdown renders the whole
# description as a single heading.
DESCRIPTION = """
# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
"""

# Columns shown in the leaderboard table (also used for the empty-result table).
_TABLE_COLUMNS = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]


def _parse_name_filter(raw):
    """Split the free-text model filter into lowercase name fragments.

    Commas and semicolons both act as separators and spaces are removed.
    Returns an empty list when no name filtering should be applied: the input
    is empty/None, contains only separators, or one of the fragments is
    exactly ``"all"``.
    """
    cleaned = (raw or "").replace(",", ";").replace(" ", "")
    fragments = [part.lower() for part in cleaned.split(";") if part]
    # "all" is a wildcard only as a whole token; a substring test would also
    # disable the filter for names such as "mistral-small".
    if "all" in fragments:
        return []
    return fragments


def _empty_result_figure():
    """Build a blank figure carrying a centered "no results" annotation."""
    fig = go.Figure()
    fig.add_annotation(
        text="No models found matching your filter",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=14, color="gray"),
    )
    fig.update_layout(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        height=400,
        margin=dict(l=50, r=50, t=50, b=50),
    )
    return fig


def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all",
    data_slice: str = "Overall",
):
    """Filter the results and build the leaderboard plot and table.

    Args:
        filter_models_by_name: Name fragments separated by ``,`` or ``;``.
            A row is kept when its ``LLM_lower_case`` value contains any
            fragment as a literal substring. Empty input, or a fragment that
            is exactly ``all``, disables name filtering.
        high_ar_only: Keep only rows whose ``Answer %`` is at least 95.
        size_filter: ``"all"`` or a value to match against ``Model Size``.
        access_filter: ``"all"`` or a value to match against ``Accessibility``.
        data_slice: Slice name passed to ``apply_data_slice``.

    Returns:
        A ``(figure, dataframe)`` pair. When no row survives the filters the
        figure is a placeholder with a "no results" message and the dataframe
        is empty with the usual table columns.
    """
    df = results_df.copy()

    # Apply data slice first (recalculates metrics and re-sorts)
    df = apply_data_slice(df, data_slice)

    # Filter by answer rate if toggle is on
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name
    name_fragments = _parse_name_filter(filter_models_by_name)
    if name_fragments:
        # str.contains interprets its argument as a regular expression, so the
        # user-typed fragments are escaped: characters like "(" or "+" must
        # match literally instead of raising re.error or over-matching.
        pattern = "|".join(re.escape(fragment) for fragment in name_fragments)
        df = df[df["LLM_lower_case"].str.contains(pattern, na=False)]

    if df.empty:
        # Show "no results" message in the plot
        return _empty_result_figure(), pd.DataFrame(columns=_TABLE_COLUMNS)

    fig = visualize_leaderboard(df)
    return fig, df[_TABLE_COLUMNS]


with gr.Blocks(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .header-logo {
        display: flex;
        align-items: center;
        gap: 10px;
        margin-bottom: 10px;
    }
    .header-logo img {
        height: 40px;
    }
    footer {
        display: none !important;
    }
    .modebar {
        display: none !important;
    }
    .horizontal-radio .wrap {
        display: flex !important;
        flex-direction: row !important;
        gap: 8px !important;
    }
    """,
) as demo:
    gr.HTML(
        ''
    )
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)
        with gr.Column(scale=1):
            filter_input = gr.Textbox(
                placeholder="Filter models...",
                show_label=False,
                value="",
            )
            high_ar_toggle = gr.Checkbox(
                label="Only models with ≥95% answer rate",
                value=False,
            )
            size_filter = gr.Radio(
                choices=["all", "small", "large"],
                value="all",
                label="Model size",
                elem_classes=["horizontal-radio"],
            )
            access_filter = gr.Radio(
                choices=["all", "commercial", "open"],
                value="all",
                label="Model type",
                elem_classes=["horizontal-radio"],
            )
            data_slice = gr.Dropdown(
                choices=list(DATA_SLICE_MAP.keys()),
                value="Overall",
                label="Data Slice",
            )

    with gr.Row():
        table_output = gr.Dataframe(
            label="Leaderboard",
            interactive=False,
            max_height=500,
        )

    inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
    outputs = [plot_output, table_output]

    # Load initial data on page load
    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)

    # Update on filter change or toggle change; every control re-runs the same
    # callback with the full set of inputs.
    for control in inputs:
        control.change(fn=leaderboard, inputs=inputs, outputs=outputs)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)