"""Gradio app that displays the FizzBuzz LLM benchmark leaderboard.

The page shows a grouped bar chart of the top-ranked models and an editable
table of all scores; editing the table regenerates the chart.
"""

import gradio as gr
import pandas as pd
import matplotlib

# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use("Agg")
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(style='whitegrid', context='notebook', font_scale=1.75)

DESCRIPTION = """\
# FizzBuzz LLM Benchmark

A (silly) benchmark for testing how well LLMs can play the children's game [Fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz). By modifying the game's standard rules, this benchmark tests generalization, long multi-turn conversation, arithmetic and counting abilities of LLMs. See [the GitHub repository](https://github.com/venkatasg/fizzbuzz-bench) for all the details on how I tested the models.

I'll try to keep this updated with the latest models.
"""

# Column layout shared by the bundled data, the table and the chart.
COLUMNS = ["Model", "Standard FizzBuzz", "Buzz=7"]

# Rows of [model name, Standard FizzBuzz score, Buzz=7 score].
INITIAL_DATA = [
    ["gpt-5.2-pro", 200, 200],
    ["claude-opus-4-6", 200, 186],
    ["claude-opus-4-5", 200, 200],
    ["claude-sonnet-4", 200, 200],
    ["gemini-3-pro-preview", 200, 103],
    ["GLM-4.7", 13, 52],
    ["gemini-2.0-flash", 200, 39],
    ["claude-sonnet-4-5", 200, 200],
    ["Llama-4-Maverick", 5, 41],
    ["gemini-3-flash-preview", 115, 83],
    ["gpt-5.1", 39, 27],
    ["Kimi-K2-Thinking", 21, 9],
    ["claude-haiku-4-5", 5, 13],
    ["gpt-5.2", 5, 11],
    ["Qwen3-235B-A22B", 25, 11],
    ["DeepSeek-V3-0324", 43, 9],
    ["gemini-2.5-pro", 200, 200],
    ["gpt-4.1", 23, 5],
    ["gpt-4.1-mini", 33, 5],
    ["gpt-3.5-turbo", 11, 1],
    ["claude-3-7-sonnet", 200, 5],
    ["gemini-2.5-flash", 3, 5],
    ["gemma-3-27b-it", 5, 5],
    ["DeepSeek-V3.1", 95, 5],
    ["Qwen3-Next-80B-A3B", 17, 5],
    ["gpt-4.1-nano", 3, 3],
    ["Llama-3.3-70B", 3, 3],
    ["Kimi-K2-Instruct-0905", 200, 19],
    ["Minimax-M2.5", 9, 9],
    ["GLM-5", 53, 45],
]


def make_sorted_df(raw=None):
    """Build a DataFrame sorted by Buzz=7, then Standard FizzBuzz, descending.

    Args:
        raw: ``None`` to use ``INITIAL_DATA``; a DataFrame (copied, and its
            columns renamed to ``COLUMNS``); or any row data accepted by
            ``pd.DataFrame(raw, columns=COLUMNS)``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index whose two score columns are
        integers. Cells that cannot be parsed as a finite number become 0.
    """
    if raw is None:
        df = pd.DataFrame(INITIAL_DATA, columns=COLUMNS)
    elif isinstance(raw, pd.DataFrame):
        df = raw.copy()
        df.columns = COLUMNS
    else:
        df = pd.DataFrame(raw, columns=COLUMNS)

    for column in ("Standard FizzBuzz", "Buzz=7"):
        numeric = pd.to_numeric(df[column], errors="coerce")
        # "inf"/"-inf" parse as floats but cannot be cast to int; treat them
        # like any other unusable cell instead of raising in astype().
        numeric = numeric.replace([np.inf, -np.inf], np.nan)
        df[column] = numeric.fillna(0).astype(int)

    df = df.sort_values(
        ["Buzz=7", "Standard FizzBuzz"], ascending=False
    ).reset_index(drop=True)
    return df


def create_chart(df, top_n=15):
    """Create a grouped horizontal bar chart of the leading models.

    Args:
        df: DataFrame with the ``COLUMNS`` layout, already in ranking order
            (as returned by ``make_sorted_df``); only its first rows are used.
        top_n: Number of leading rows to plot. Defaults to 15.

    Returns:
        The matplotlib ``Figure``. It has been removed from pyplot's registry
        but can still be rendered by the caller.
    """
    leaders = df.head(top_n).copy()

    # Reshape to long format for seaborn: one row per (model, task) pair.
    plot_df = leaders.melt(id_vars="Model", var_name="Task", value_name="Score")

    fig, ax = plt.subplots(figsize=(15, 15))
    sns.barplot(
        data=plot_df,
        y="Model",
        x="Score",
        hue="Task",
        hue_order=['Buzz=7', 'Standard FizzBuzz'],
        orient="h",
        ax=ax,
        palette='colorblind',
    )
    ax.set_xlim(0, 200)
    ax.set_xlabel("Successful turns")
    ax.set_ylabel("")
    ax.set_title(f"Top {top_n} models ranked by modified FizzBuzz (Buzz=7) score")
    ax.legend(loc="lower right")
    fig.tight_layout()

    # pyplot keeps every figure it creates alive until it is closed. This
    # function runs on each table edit, so without closing, figures would
    # accumulate for the lifetime of the server process.
    plt.close(fig)
    return fig


def on_data_edit(table_data):
    """Regenerate the chart when the user edits the table.

    Args:
        table_data: Current table contents as delivered by the ``table.input``
            event; passed straight to ``make_sorted_df``.

    Returns:
        The figure produced by ``create_chart`` for the re-sorted data.
    """
    df = make_sorted_df(table_data)
    return create_chart(df)


# --- Build the initial state ---
initial_df = make_sorted_df()

# --- UI ---
with gr.Blocks(title="FizzBuzz LLM Benchmark") as demo:
    gr.Markdown(DESCRIPTION)

    plot = gr.Plot(value=create_chart(initial_df))

    gr.Markdown("### All Model Scores")
    table = gr.Dataframe(
        value=initial_df,
        interactive=True,
        column_count=(3, "fixed"),
    )

    table.input(on_data_edit, inputs=[table], outputs=[plot])

# Define the custom CSS
css = """
.gradio-container {
    max-width: 800px !important;
    margin-left: auto !important;
    margin-right: auto !important;
}
"""

demo.launch(css=css)