Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# Global seaborn theme for every figure drawn by this module: white grid
# background, "notebook" context, and fonts scaled up by 1.75.
sns.set(style='whitegrid', context='notebook', font_scale=1.75)

# Markdown shown at the top of the page (rendered through gr.Markdown).
# The backslash after the opening quotes suppresses the leading newline.
DESCRIPTION = """\
# FizzBuzz LLM Benchmark
A (silly) benchmark for testing how well LLMs can play the children's game [Fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz). By modifying the game's standard rules, this benchmark tests generalization, long multi-turn conversation, arithmetic and counting abilities of LLMs.
See [the GitHub repository](https://github.com/venkatasg/fizzbuzz-bench) for all the details on how I tested the models. I'll try to keep this updated with the latest models.
"""
# Column layout shared by the score table and the chart: the model name
# followed by one score column per task.
COLUMNS = ["Model", "Standard FizzBuzz", "Buzz=7"]

# Seed rows for the table, one [model, standard score, Buzz=7 score] per model.
INITIAL_DATA = [
    ["gpt-5.2-pro", 200, 200],
    ["claude-opus-4-6", 200, 186],
    ["claude-opus-4-5", 200, 200],
    ["claude-sonnet-4", 200, 200],
    ["gemini-3-pro-preview", 200, 103],
    ["GLM-4.7", 13, 52],
    ["gemini-2.0-flash", 200, 39],
    ["claude-sonnet-4-5", 200, 200],
    ["Llama-4-Maverick", 5, 41],
    ["gemini-3-flash-preview", 115, 83],
    ["gpt-5.1", 39, 27],
    ["Kimi-K2-Thinking", 21, 9],
    ["claude-haiku-4-5", 5, 13],
    ["gpt-5.2", 5, 11],
    ["Qwen3-235B-A22B", 25, 11],
    ["DeepSeek-V3-0324", 43, 9],
    ["gemini-2.5-pro", 200, 200],
    ["gpt-4.1", 23, 5],
    ["gpt-4.1-mini", 33, 5],
    ["gpt-3.5-turbo", 11, 1],
    ["claude-3-7-sonnet", 200, 5],
    ["gemini-2.5-flash", 3, 5],
    ["gemma-3-27b-it", 5, 5],
    ["DeepSeek-V3.1", 95, 5],
    ["Qwen3-Next-80B-A3B", 17, 5],
    ["gpt-4.1-nano", 3, 3],
    ["Llama-3.3-70B", 3, 3],
    ["Kimi-K2-Instruct-0905", 200, 19],
    ["Minimax-M2.5", 9, 9],
    ["GLM-5", 53, 45],
]


def make_sorted_df(raw=None):
    """Build the score table as a DataFrame sorted by Buzz=7 descending.

    Args:
        raw: Source rows. ``None`` uses ``INITIAL_DATA``; a DataFrame is
            copied and its columns are renamed to ``COLUMNS``; anything else
            is passed to ``pd.DataFrame`` with ``COLUMNS`` as the header.

    Returns:
        A new DataFrame with integer score columns, ordered by "Buzz=7" and
        then "Standard FizzBuzz" (both descending) with a fresh 0..n-1 index.
        Scores that are not finite numbers (blank cells, text, NaN, +/-inf)
        become 0.
    """
    if raw is None:
        df = pd.DataFrame(INITIAL_DATA, columns=COLUMNS)
    elif isinstance(raw, pd.DataFrame):
        # Work on a copy so the caller's frame is never relabelled in place.
        df = raw.copy()
        df.columns = COLUMNS
    else:
        df = pd.DataFrame(raw, columns=COLUMNS)

    # Every column after the model name holds a score.
    for column in COLUMNS[1:]:
        numeric = pd.to_numeric(df[column], errors="coerce")
        # "inf" parses as a float, survives fillna() and makes astype(int)
        # raise, so treat infinities like any other unusable cell.
        numeric = numeric.replace([float("inf"), float("-inf")], float("nan"))
        df[column] = numeric.fillna(0).astype(int)

    df = df.sort_values(
        ["Buzz=7", "Standard FizzBuzz"], ascending=False
    ).reset_index(drop=True)
    return df
def create_chart(df, top_n=15):
    """Draw a grouped horizontal bar chart of the first ``top_n`` rows of ``df``.

    Args:
        df: Score table with a "Model" column plus the "Buzz=7" and
            "Standard FizzBuzz" score columns. Rows are plotted in the order
            given, so the caller is expected to have sorted them already.
        top_n: Number of leading rows to plot. Defaults to 15.

    Returns:
        The matplotlib Figure holding the chart. It is already detached from
        pyplot's figure registry, so the caller owns it.
    """
    top_models = df.head(top_n)

    # seaborn wants long format: one row per (model, task) pair.
    plot_df = top_models.melt(
        id_vars="Model",
        var_name="Task",
        value_name="Score",
    )

    fig, ax = plt.subplots(figsize=(15, 15))
    sns.barplot(
        data=plot_df,
        y="Model",
        x="Score",
        hue="Task",
        hue_order=['Buzz=7', 'Standard FizzBuzz'],
        orient="h",
        ax=ax,
        palette='colorblind'
    )
    ax.set_xlim(0, 200)
    ax.set_xlabel("Successful turns")
    ax.set_ylabel("")
    ax.set_title(f"Top {top_n} models ranked by modified FizzBuzz (Buzz=7) score")
    ax.legend(loc="lower right")
    ax.tick_params(axis="y")
    fig.tight_layout()

    # pyplot keeps every figure it creates alive until it is closed. This
    # function runs on each table edit, so without this the figures pile up.
    # The Figure object itself stays usable after being closed.
    plt.close(fig)
    return fig
def on_data_edit(table_data):
    """Return a freshly drawn chart for the edited table contents."""
    return create_chart(make_sorted_df(table_data))
# --- Page styling ---
# Cap the container width and centre it horizontally.
css = """
.gradio-container {
max-width: 800px !important;
margin-left: auto !important;
margin-right: auto !important;
}
"""

# --- Initial table contents ---
initial_df = make_sorted_df()

# --- Layout and event wiring ---
with gr.Blocks(title="FizzBuzz LLM Benchmark") as demo:
    gr.Markdown(DESCRIPTION)
    plot = gr.Plot(value=create_chart(initial_df))
    gr.Markdown("### All Model Scores")
    table = gr.Dataframe(
        value=initial_df,
        interactive=True,
        column_count=(3, "fixed"),
    )
    # Redraw the chart whenever the user edits a cell.
    table.input(fn=on_data_edit, inputs=table, outputs=plot)

demo.launch(css=css)