Spaces:

vectara
/

leaderboard

Running on CPU Upgrade

App Files Files Community

leaderboard / app /app.py

ofermend

updated to gradio; python 3.11; visual improvements

d0c57df 3 days ago

raw

history blame

4.49 kB

	import gradio as gr
	import pandas as pd
	import matplotlib.pyplot

	from app_utils import load_results, visualize_leaderboard

	# Load the leaderboard results once at import time; leaderboard() filters a
	# fresh copy of this object on every call, so it is never mutated in place.
	# NOTE(review): presumably a pandas DataFrame (it is copied and indexed by
	# column name further down) -- confirm against app_utils.load_results.
	results_df = load_results()

	# Markdown text for the page header; it is passed to gr.Markdown when the UI
	# is built.
	DESCRIPTION = """
	# Hughes Hallucination Evaluation Model (HHEM) Leaderboard

	Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
	this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
	For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
	For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
	"""


	def _no_results_figure():
	    """Build a blank figure whose only content is a "no matches" message."""
	    # Create the Figure directly instead of going through pyplot: pyplot keeps
	    # a global reference to every figure it creates until it is closed, so
	    # calling pyplot.subplots() on each request leaks memory in a server that
	    # stays up.
	    from matplotlib.figure import Figure

	    fig = Figure(figsize=(10, 5))
	    ax = fig.subplots()
	    ax.text(0.5, 0.5, "No models found matching your filter",
	            ha='center', va='center', fontsize=14, color='gray')
	    ax.set_xlim(0, 1)
	    ax.set_ylim(0, 1)
	    ax.axis('off')
	    return fig


	def leaderboard(
	    filter_models_by_name: str = "",
	    high_ar_only: bool = False,
	    size_filter: str = "all",
	    access_filter: str = "all"
	):
	    """Filter the leaderboard results and build the plot and table to display.

	    Args:
	        filter_models_by_name: Model-name fragments separated by ``;`` or
	            ``,``. Spaces are dropped and the fragments are lower-cased; a row
	            is kept when its "LLM_lower_case" value contains any fragment as a
	            literal substring. An empty value, or a fragment equal to "all",
	            disables name filtering.
	        high_ar_only: When true, keep only rows whose "Answer %" is >= 95.
	        size_filter: Value to match exactly against "Model Size"; "all" or an
	            empty value disables this filter.
	        access_filter: Value to match exactly against "Accessibility"; "all"
	            or an empty value disables this filter.

	    Returns:
	        A ``(figure, table)`` pair: the figure from ``visualize_leaderboard``
	        (or a placeholder figure when nothing matches) and a DataFrame limited
	        to the four display columns.
	    """
	    import re  # local import: `re` is not among this file's top-level imports

	    display_columns = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]
	    df = results_df.copy()

	    # Filter by answer rate if toggle is on
	    if high_ar_only:
	        df = df[df["Answer %"] >= 95]

	    # Filter by model size
	    if size_filter and size_filter != "all":
	        df = df[df["Model Size"] == size_filter]

	    # Filter by accessibility
	    if access_filter and access_filter != "all":
	        df = df[df["Accessibility"] == access_filter]

	    # Filter by model name
	    normalized = (filter_models_by_name or "").replace(",", ";").replace(" ", "")
	    filter_list = [name.lower() for name in normalized.split(";") if name]
	    # "all" must be a whole entry to mean "no filter"; a substring test would
	    # also switch filtering off for queries such as "mistral-small".
	    if filter_list and "all" not in filter_list:
	        # Escape each fragment so user text is matched literally, and join with
	        # a bare "|" so the regex is a real alternation between fragments.
	        pattern = "|".join(re.escape(name) for name in filter_list)
	        df = df[df["LLM_lower_case"].str.contains(pattern, na=False)]

	    if df.empty:
	        # Show "no results" message in the plot alongside an empty table
	        return _no_results_figure(), pd.DataFrame(columns=display_columns)

	    fig = visualize_leaderboard(df)
	    return fig, df[display_columns]


	# Page layout: logo header, description, a plot next to the filter controls,
	# and the results table underneath.
	with gr.Blocks(
	    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
	    theme=gr.themes.Soft(),
	    css="""
	.header-logo {
	display: flex;
	align-items: center;
	gap: 10px;
	margin-bottom: 10px;
	}
	.header-logo img {
	height: 40px;
	}
	footer { display: none !important; }
	"""
	) as demo:
	    gr.HTML('<div class="header-logo"><img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara"></div>')
	    gr.Markdown(DESCRIPTION)

	    with gr.Row():
	        # Wide column: the chart.
	        with gr.Column(scale=3):
	            plot_output = gr.Plot(show_label=False)
	        # Narrow column: the filter controls.
	        with gr.Column(scale=1):
	            filter_input = gr.Textbox(value="", placeholder="Filter models...", show_label=False)
	            high_ar_toggle = gr.Checkbox(value=False, label="Only models with ≥95% answer rate")
	            size_filter = gr.Radio(
	                label="Model size",
	                choices=["all", "small", "large"],
	                value="all",
	            )
	            access_filter = gr.Radio(
	                label="Model type",
	                choices=["all", "commercial", "open"],
	                value="all",
	            )

	    with gr.Row():
	        table_output = gr.Dataframe(label="Leaderboard", interactive=False, max_height=500)

	    # The same callback, inputs and outputs serve the initial load and every control.
	    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
	    outputs = [plot_output, table_output]

	    # Populate the plot and table when the page first loads.
	    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)

	    # Re-run the filter whenever any control changes.
	    for control in inputs:
	        control.change(fn=leaderboard, inputs=inputs, outputs=outputs)


	# Script entry point: serve the app on all network interfaces, port 7860.
	if __name__ == "__main__":
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	    )