Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

leaderboard / app /app.py

ofermend

updated

0e2da72 about 11 hours ago

raw

history blame contribute delete

5.34 kB

	import gradio as gr
	import pandas as pd
	import plotly.graph_objects as go

	from app_utils import load_results, visualize_leaderboard, apply_data_slice, DATA_SLICE_MAP

	# Leaderboard results, loaded once when the module is imported.
	# leaderboard() takes a fresh .copy() of this object on every call, so the
	# filters it applies never modify the data loaded here.
	results_df = load_results()

	# Markdown introduction for the page; rendered with gr.Markdown(DESCRIPTION)
	# when the UI is built.
	DESCRIPTION = """
	# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

	Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
	this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
	For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
	For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
	"""


	def leaderboard(
	    filter_models_by_name: str = "",
	    high_ar_only: bool = False,
	    size_filter: str = "all",
	    access_filter: str = "all",
	    data_slice: str = "Overall"
	):
	    """Filter the leaderboard and build the plot and table for the UI.

	    Args:
	        filter_models_by_name: Model-name fragments separated by "," or ";".
	            Spaces are ignored. Each fragment is lower-cased and matched as
	            a literal substring against the "LLM_lower_case" column; a row
	            is kept when any fragment matches. An empty value, or the
	            standalone fragment "all", disables name filtering.
	        high_ar_only: When true, keep only rows whose "Answer %" is >= 95.
	        size_filter: Value that "Model Size" must equal; "all" (or an empty
	            value) disables the filter.
	        access_filter: Value that "Accessibility" must equal; "all" (or an
	            empty value) disables the filter.
	        data_slice: Slice name forwarded to ``apply_data_slice``.

	    Returns:
	        A ``(figure, table)`` tuple. ``figure`` is the result of
	        ``visualize_leaderboard`` for the filtered rows, or a placeholder
	        figure carrying a "no models found" annotation when nothing is
	        left. ``table`` holds the "LLM", "Hallucination %", "Answer %" and
	        "Avg Summary Words" columns (an empty frame with those columns when
	        nothing is left).
	    """
	    table_columns = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]

	    # Work on a copy so the module-level results are never modified.
	    # Apply data slice first (recalculates metrics and re-sorts)
	    df = apply_data_slice(results_df.copy(), data_slice)

	    # Filter by answer rate if toggle is on
	    if high_ar_only:
	        df = df[df["Answer %"] >= 95]

	    # Filter by model size
	    if size_filter and size_filter != "all":
	        df = df[df["Model Size"] == size_filter]

	    # Filter by accessibility
	    if access_filter and access_filter != "all":
	        df = df[df["Accessibility"] == access_filter]

	    # Filter by model name. Commas and semicolons both separate fragments;
	    # `or ""` tolerates a None value.
	    raw_filter = (filter_models_by_name or "").replace(",", ";").replace(" ", "")
	    fragments = [part.lower() for part in raw_filter.split(";") if part]

	    # "all" only switches filtering off when it is a fragment of its own;
	    # a substring test would also trigger on names such as "mistral-small".
	    if fragments and "all" not in fragments:
	        names = df["LLM_lower_case"]
	        # Literal matching (regex=False) keeps characters such as "+", "("
	        # or "." in a model name from being read as regex syntax, and the
	        # per-fragment masks are OR-ed so any fragment may match. The old
	        # "\|" separator produced an escaped pipe that only matched a
	        # literal "|" in the name.
	        keep = names.str.contains(fragments[0], regex=False, na=False)
	        for fragment in fragments[1:]:
	            keep |= names.str.contains(fragment, regex=False, na=False)
	        df = df[keep]

	    if df.empty:
	        # Show "no results" message in the plot
	        fig = go.Figure()
	        fig.add_annotation(
	            text="No models found matching your filter",
	            xref="paper", yref="paper", x=0.5, y=0.5,
	            showarrow=False, font=dict(size=14, color="gray")
	        )
	        fig.update_layout(
	            xaxis=dict(visible=False), yaxis=dict(visible=False),
	            height=400, margin=dict(l=50, r=50, t=50, b=50)
	        )
	        return fig, pd.DataFrame(columns=table_columns)

	    fig = visualize_leaderboard(df)
	    return fig, df[table_columns]


	# Build the Gradio page. The CSS styles the logo header, hides the default
	# footer and the plot mode bar, and lays the radio groups out horizontally.
	with gr.Blocks(
	    css="""
	.header-logo {
	display: flex;
	align-items: center;
	gap: 10px;
	margin-bottom: 10px;
	}
	.header-logo img {
	height: 40px;
	}
	footer { display: none !important; }
	.modebar { display: none !important; }
	.horizontal-radio .wrap {
	display: flex !important;
	flex-direction: row !important;
	gap: 8px !important;
	}
	""",
	    theme=gr.themes.Soft(),
	    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
	) as demo:
	    # Page header: logo image followed by the Markdown introduction.
	    gr.HTML(
	        '<div class="header-logo">'
	        '<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
	        '</div>'
	    )
	    gr.Markdown(DESCRIPTION)

	    # Upper row: the chart (scale 3) next to the filter controls (scale 1).
	    with gr.Row():
	        with gr.Column(scale=3):
	            plot_output = gr.Plot(show_label=False)

	        with gr.Column(scale=1):
	            filter_input = gr.Textbox(
	                value="",
	                show_label=False,
	                placeholder="Filter models...",
	            )
	            high_ar_toggle = gr.Checkbox(
	                value=False,
	                label="Only models with ≥95% answer rate",
	            )
	            size_filter = gr.Radio(
	                label="Model size",
	                choices=["all", "small", "large"],
	                value="all",
	                elem_classes=["horizontal-radio"],
	            )
	            access_filter = gr.Radio(
	                label="Model type",
	                choices=["all", "commercial", "open"],
	                value="all",
	                elem_classes=["horizontal-radio"],
	            )
	            data_slice = gr.Dropdown(
	                label="Data Slice",
	                choices=list(DATA_SLICE_MAP),
	                value="Overall",
	            )

	    # Lower row: the leaderboard table.
	    with gr.Row():
	        table_output = gr.Dataframe(
	            max_height=500,
	            interactive=False,
	            label="Leaderboard",
	        )

	    # The order of `inputs` matches the positional parameters of leaderboard().
	    inputs = [filter_input, high_ar_toggle, size_filter, access_filter, data_slice]
	    outputs = [plot_output, table_output]
	    refresh = dict(fn=leaderboard, inputs=inputs, outputs=outputs)

	    # Populate the plot and table when the page first loads...
	    demo.load(**refresh)

	    # ...and recompute both whenever any control changes, registering the
	    # handlers in the same order as the controls appear in `inputs`.
	    for _control in inputs:
	        _control.change(**refresh)


	# Start the server only when the file is run as a script.
	if __name__ == "__main__":
	    demo.launch(server_port=7860, server_name="0.0.0.0")