fizzbuzz-bench / app.py
venkatasg's picture
2 new models
983b472
import gradio as gr
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Global seaborn theme applied to every chart drawn by this module:
# white-grid background, "notebook" context, fonts scaled by 1.75.
sns.set(style='whitegrid', context='notebook', font_scale=1.75)
# Markdown shown as the page introduction (passed to gr.Markdown in the UI).
# The backslash after the opening quotes suppresses the leading newline.
DESCRIPTION = """\
# FizzBuzz LLM Benchmark
A (silly) benchmark for testing how well LLMs can play the children's game [Fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz). By modifying the game's standard rules, this benchmark tests generalization, long multi-turn conversation, arithmetic and counting abilities of LLMs.
See [the GitHub repository](https://github.com/venkatasg/fizzbuzz-bench) for all the details on how I tested the models. I'll try to keep this updated with the latest models.
"""
# Column labels for the leaderboard table: the model name followed by one
# score column per task. make_sorted_df() relabels incoming data with these.
COLUMNS = ["Model", "Standard FizzBuzz", "Buzz=7"]
# Seed leaderboard rows, one per model, in COLUMNS order:
#   [model name, Standard FizzBuzz score, Buzz=7 score]
# Scores are plotted as "Successful turns" on an axis fixed to 0-200
# (presumably 200 is the maximum number of turns per run -- confirm against
# the benchmark repository).
# Row order here does not matter for display: make_sorted_df() re-sorts by
# Buzz=7, then Standard FizzBuzz, both descending.
INITIAL_DATA = [
    ["gpt-5.2-pro", 200, 200],
    ["claude-opus-4-6", 200, 186],
    ["claude-opus-4-5", 200, 200],
    ["claude-sonnet-4", 200, 200],
    ["gemini-3-pro-preview", 200, 103],
    ["GLM-4.7", 13, 52],
    ["gemini-2.0-flash", 200, 39],
    ["claude-sonnet-4-5", 200, 200],
    ["Llama-4-Maverick", 5, 41],
    ["gemini-3-flash-preview", 115, 83],
    ["gpt-5.1", 39, 27],
    ["Kimi-K2-Thinking", 21, 9],
    ["claude-haiku-4-5", 5, 13],
    ["gpt-5.2", 5, 11],
    ["Qwen3-235B-A22B", 25, 11],
    ["DeepSeek-V3-0324", 43, 9],
    ["gemini-2.5-pro", 200, 200],
    ["gpt-4.1", 23, 5],
    ["gpt-4.1-mini", 33, 5],
    ["gpt-3.5-turbo", 11, 1],
    ["claude-3-7-sonnet", 200, 5],
    ["gemini-2.5-flash", 3, 5],
    ["gemma-3-27b-it", 5, 5],
    ["DeepSeek-V3.1", 95, 5],
    ["Qwen3-Next-80B-A3B", 17, 5],
    ["gpt-4.1-nano", 3, 3],
    ["Llama-3.3-70B", 3, 3],
    ["Kimi-K2-Instruct-0905", 200, 19],
    ["Minimax-M2.5", 9, 9],
    ["GLM-5", 53, 45],
]
def make_sorted_df(raw=None):
    """Build the leaderboard DataFrame, sorted by Buzz=7 descending.

    Args:
        raw: Table contents to normalise. ``None`` uses ``INITIAL_DATA``.
            A ``DataFrame`` is copied and its columns are relabelled to
            ``COLUMNS``. Anything else is handed to ``pd.DataFrame`` with
            ``COLUMNS`` as the column names.

    Returns:
        A new DataFrame whose two score columns are integers, sorted by
        "Buzz=7" and then "Standard FizzBuzz" (both descending), with a
        fresh 0..n-1 index. Scores that are not valid finite numbers
        become 0.
    """
    if raw is None:
        df = pd.DataFrame(INITIAL_DATA, columns=COLUMNS)
    elif isinstance(raw, pd.DataFrame):
        # Copy so the caller's frame is never mutated by the relabel below.
        df = raw.copy()
        df.columns = COLUMNS
    else:
        df = pd.DataFrame(raw, columns=COLUMNS)

    for column in ("Standard FizzBuzz", "Buzz=7"):
        # Unparseable cells (blank, text, ...) become NaN.
        scores = pd.to_numeric(df[column], errors="coerce")
        # "inf"/"-inf" parse successfully but cannot be cast to int; treat
        # them as invalid too so a stray edit cannot crash the callback.
        scores = scores.replace([np.inf, -np.inf], np.nan)
        df[column] = scores.fillna(0).astype(int)

    ranked = df.sort_values(["Buzz=7", "Standard FizzBuzz"], ascending=False)
    return ranked.reset_index(drop=True)
def create_chart(df, top_n=15):
    """Create a grouped horizontal bar chart of the top models using seaborn.

    Args:
        df: Leaderboard with a "Model" column plus one score column per
            task. Rows are taken in their existing order, so the frame
            should already be sorted best-first.
        top_n: Number of leading rows to plot. Defaults to 15.

    Returns:
        The matplotlib ``Figure`` holding the chart. The figure is removed
        from pyplot's global registry before it is returned, so repeated
        calls (one per table edit) do not accumulate open figures. The
        returned object can still be rendered or saved.
    """
    top_models = df.head(top_n).copy()

    # Reshape to long format for seaborn: one (Model, Task, Score) row per bar.
    plot_df = top_models.melt(
        id_vars="Model",
        var_name="Task",
        value_name="Score",
    )

    fig, ax = plt.subplots(figsize=(15, 15))
    sns.barplot(
        data=plot_df,
        y="Model",
        x="Score",
        hue="Task",
        # Draw the ranking task first within each model's pair of bars.
        hue_order=['Buzz=7', 'Standard FizzBuzz'],
        orient="h",
        ax=ax,
        palette='colorblind',
    )
    ax.set_xlim(0, 200)
    ax.set_xlabel("Successful turns")
    ax.set_ylabel("")
    ax.set_title(f"Top {top_n} models ranked by modified FizzBuzz (Buzz=7) score")
    ax.legend(loc="lower right")
    ax.tick_params(axis="y")
    fig.tight_layout()

    # plt.subplots registers the figure with pyplot, which keeps it alive
    # until it is explicitly closed. Deregister it here; the Figure object
    # itself stays usable by the caller.
    plt.close(fig)
    return fig
def on_data_edit(table_data):
    """Rebuild the chart from the table's current contents after a user edit.

    The edited data is normalised and re-sorted by ``make_sorted_df`` and
    then drawn by ``create_chart``; the resulting figure is returned.
    """
    return create_chart(make_sorted_df(table_data))
# --- Initial state: the seed leaderboard, already sorted ---
initial_df = make_sorted_df()

# --- Custom CSS: cap the page width and centre it horizontally ---
css = """
.gradio-container {
max-width: 800px !important;
margin-left: auto !important;
margin-right: auto !important;
}
"""

# --- UI: intro text, chart, then the editable score table ---
with gr.Blocks(title="FizzBuzz LLM Benchmark") as demo:
    gr.Markdown(DESCRIPTION)
    plot = gr.Plot(value=create_chart(initial_df))
    gr.Markdown("### All Model Scores")
    table = gr.Dataframe(value=initial_df, interactive=True, column_count=(3, "fixed"))
    # Redraw the chart from the table contents whenever the user edits a cell.
    table.input(fn=on_data_edit, inputs=[table], outputs=[plot])

demo.launch(css=css)