Spaces:

venkatasg
/

fizzbuzz-bench

Sleeping

App Files Files Community

fizzbuzz-bench / app.py

venkatasg

2 new models

983b472 about 2 months ago

raw

history blame contribute delete

4.05 kB

	import gradio as gr
	import pandas as pd
	import matplotlib
	matplotlib.use("Agg")
	import seaborn as sns
	import matplotlib.pyplot as plt
	import numpy as np

	# Set seaborn's global theme once at import time: white-grid style,
	# "notebook" context and a 1.75x font scale.
	sns.set(style='whitegrid', context='notebook', font_scale=1.75)

	# Markdown text for the page header; rendered through gr.Markdown in the UI
	# section below.
	DESCRIPTION = """\
	# FizzBuzz LLM Benchmark

	A (silly) benchmark for testing how well LLMs can play the children's game [Fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz). By modifying the game's standard rules, this benchmark tests generalization, long multi-turn conversation, arithmetic and counting abilities of LLMs.
	See [the GitHub repository](https://github.com/venkatasg/fizzbuzz-bench) for all the details on how I tested the models. I'll try to keep this updated with the latest models.
	"""

	# Column labels applied to every DataFrame built by make_sorted_df.
	COLUMNS = ["Model", "Standard FizzBuzz", "Buzz=7"]

	# Seed rows for the leaderboard, one per model, in COLUMNS order:
	# [model name, "Standard FizzBuzz" score, "Buzz=7" score].
	# Row order here is irrelevant because make_sorted_df sorts the rows.
	# NOTE(review): scores look like counts of successful turns with a ceiling
	# of 200 (the chart's x-axis label and limit suggest this) -- confirm
	# against the benchmark repository.
	INITIAL_DATA = [
	["gpt-5.2-pro", 200, 200],
	["claude-opus-4-6", 200, 186],
	["claude-opus-4-5", 200, 200],
	["claude-sonnet-4", 200, 200],
	["gemini-3-pro-preview", 200, 103],
	["GLM-4.7", 13, 52],
	["gemini-2.0-flash", 200, 39],
	["claude-sonnet-4-5", 200, 200],
	["Llama-4-Maverick", 5, 41],
	["gemini-3-flash-preview", 115, 83],
	["gpt-5.1", 39, 27],
	["Kimi-K2-Thinking", 21, 9],
	["claude-haiku-4-5", 5, 13],
	["gpt-5.2", 5, 11],
	["Qwen3-235B-A22B", 25, 11],
	["DeepSeek-V3-0324", 43, 9],
	["gemini-2.5-pro", 200, 200],
	["gpt-4.1", 23, 5],
	["gpt-4.1-mini", 33, 5],
	["gpt-3.5-turbo", 11, 1],
	["claude-3-7-sonnet", 200, 5],
	["gemini-2.5-flash", 3, 5],
	["gemma-3-27b-it", 5, 5],
	["DeepSeek-V3.1", 95, 5],
	["Qwen3-Next-80B-A3B", 17, 5],
	["gpt-4.1-nano", 3, 3],
	["Llama-3.3-70B", 3, 3],
	["Kimi-K2-Instruct-0905", 200, 19],
	["Minimax-M2.5", 9, 9],
	["GLM-5", 53, 45],
	]


	def make_sorted_df(raw=None):
	    """Return the leaderboard as a DataFrame ordered best-first.

	    ``raw`` may be ``None`` (the built-in ``INITIAL_DATA`` is used), an
	    existing DataFrame (copied, then relabelled with ``COLUMNS``), or any
	    row data that ``pd.DataFrame`` accepts. Score cells that cannot be
	    parsed as numbers become 0. Rows are ordered by "Buzz=7" and then by
	    "Standard FizzBuzz", both descending, and the index is renumbered
	    from 0.
	    """
	    if isinstance(raw, pd.DataFrame):
	        frame = raw.copy()
	        frame.columns = COLUMNS
	    else:
	        rows = INITIAL_DATA if raw is None else raw
	        frame = pd.DataFrame(rows, columns=COLUMNS)

	    # Coerce both score columns to integers; unparseable cells count as 0.
	    for score_col in ("Standard FizzBuzz", "Buzz=7"):
	        as_number = pd.to_numeric(frame[score_col], errors="coerce")
	        frame[score_col] = as_number.fillna(0).astype(int)

	    ranked = frame.sort_values(["Buzz=7", "Standard FizzBuzz"], ascending=False)
	    return ranked.reset_index(drop=True)


	def create_chart(df, top_n=15):
	    """Draw a grouped horizontal bar chart for the first ``top_n`` rows of ``df``.

	    Args:
	        df: DataFrame with a "Model" column plus one column per task. The
	            rows are taken in their existing order, so the caller is
	            expected to have sorted them already.
	        top_n: Number of leading rows to plot. Defaults to 15.

	    Returns:
	        The matplotlib Figure holding the chart. It has already been closed
	        in pyplot's registry, so the caller does not need to close it.
	    """
	    leaders = df.head(top_n).copy()

	    # Reshape to long format (one row per model/task pair) for seaborn.
	    plot_df = leaders.melt(id_vars="Model", var_name="Task", value_name="Score")

	    fig, ax = plt.subplots(figsize=(15, 15))

	    sns.barplot(
	        data=plot_df,
	        y="Model",
	        x="Score",
	        hue="Task",
	        hue_order=['Buzz=7', 'Standard FizzBuzz'],
	        orient="h",
	        ax=ax,
	        palette='colorblind',
	    )

	    ax.set_xlim(0, 200)
	    ax.set_xlabel("Successful turns")
	    ax.set_ylabel("")
	    ax.set_title(f"Top {top_n} models ranked by modified FizzBuzz (Buzz=7) score")
	    ax.legend(loc="lower right")
	    ax.tick_params(axis="y")

	    fig.tight_layout()

	    # pyplot keeps a reference to every figure it creates until it is closed.
	    # This function runs on each table edit, so without this the app would
	    # accumulate one open figure per edit. The Figure object itself stays
	    # usable for rendering after being removed from pyplot's registry.
	    plt.close(fig)
	    return fig


	def on_data_edit(table_data):
	    """Rebuild the chart from the table's current contents after a user edit."""
	    return create_chart(make_sorted_df(table_data))


	# --- Initial leaderboard, sorted best-first ---
	initial_df = make_sorted_df()

	# --- Page styling: cap the container width and centre it ---
	css = """
	.gradio-container {
	max-width: 800px !important;
	margin-left: auto !important;
	margin-right: auto !important;
	}
	"""

	# --- Layout: description, chart, then the editable score table ---
	with gr.Blocks(title="FizzBuzz LLM Benchmark") as demo:
	    gr.Markdown(DESCRIPTION)
	    plot = gr.Plot(value=create_chart(initial_df))
	    gr.Markdown("### All Model Scores")
	    table = gr.Dataframe(
	        value=initial_df,
	        interactive=True,
	        column_count=(3, "fixed"),
	    )
	    # Redraw the chart whenever the user changes a cell in the table.
	    table.input(on_data_edit, inputs=[table], outputs=[plot])

	demo.launch(css=css)