import gradio as gr
import pandas as pd
import matplotlib
matplotlib.use("Agg")  
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Global seaborn theme, set once at import time so it applies to every chart
# this module draws: white-grid background, "notebook" context, and font
# elements scaled by 1.75 (presumably to stay legible on the large figure).
sns.set(style='whitegrid', context='notebook', font_scale=1.75)

# Markdown header for the page; it is passed to gr.Markdown in the UI section.
# The backslash after the opening quotes keeps the string from starting with
# a blank line.
DESCRIPTION = """\
# FizzBuzz LLM Benchmark

A (silly) benchmark for testing how well LLMs can play the children's game [Fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz). By modifying the game's standard rules, this benchmark tests generalization, long multi-turn conversation, arithmetic and counting abilities of LLMs.
See [the GitHub repository](https://github.com/venkatasg/fizzbuzz-bench) for all the details on how I tested the models. I'll try to keep this updated with the latest models.
"""

# Leaderboard column headers, in the same order as the fields of each
# INITIAL_DATA row. make_sorted_df refers to the two score columns by these
# exact names, so renaming one here requires updating that function too.
COLUMNS = ["Model", "Standard FizzBuzz", "Buzz=7"]

# Benchmark results shown when the app starts. Each row follows COLUMNS:
# [model name, Standard FizzBuzz score, Buzz=7 score]. The chart labels these
# scores "Successful turns" and fixes its axis at 0-200, so 200 is presumably
# the maximum achievable score (confirm against the benchmark repository).
# Row order here does not matter: make_sorted_df ranks the rows before display.
INITIAL_DATA = [
    ["gpt-5.2-pro", 200, 200],
    ["claude-opus-4-6", 200, 186],
    ["claude-opus-4-5", 200, 200],
    ["claude-sonnet-4", 200, 200],
    ["gemini-3-pro-preview", 200, 103],
    ["GLM-4.7", 13, 52],
    ["gemini-2.0-flash", 200, 39],
    ["claude-sonnet-4-5", 200, 200],
    ["Llama-4-Maverick", 5, 41],
    ["gemini-3-flash-preview", 115, 83],
    ["gpt-5.1", 39, 27],
    ["Kimi-K2-Thinking", 21, 9],
    ["claude-haiku-4-5", 5, 13],
    ["gpt-5.2", 5, 11],
    ["Qwen3-235B-A22B", 25, 11],
    ["DeepSeek-V3-0324", 43, 9],
    ["gemini-2.5-pro", 200, 200],
    ["gpt-4.1", 23, 5],
    ["gpt-4.1-mini", 33, 5],
    ["gpt-3.5-turbo", 11, 1],
    ["claude-3-7-sonnet", 200, 5],
    ["gemini-2.5-flash", 3, 5],
    ["gemma-3-27b-it", 5, 5],
    ["DeepSeek-V3.1", 95, 5],
    ["Qwen3-Next-80B-A3B", 17, 5],
    ["gpt-4.1-nano", 3, 3],
    ["Llama-3.3-70B", 3, 3],
    ["Kimi-K2-Instruct-0905", 200, 19],
    ["Minimax-M2.5", 9, 9],
    ["GLM-5", 53, 45],
]


def make_sorted_df(raw=None):
    """Return the leaderboard as a DataFrame ranked best-first.

    Args:
        raw: Table contents. ``None`` selects the built-in ``INITIAL_DATA``;
            a DataFrame is copied and has its columns relabelled to
            ``COLUMNS``; anything else is handed to ``pd.DataFrame`` as rows
            laid out in ``COLUMNS`` order.

    Returns:
        A new DataFrame whose two score columns are integers (values that
        cannot be parsed as numbers become 0), ordered by "Buzz=7" and then
        "Standard FizzBuzz", both descending, with a fresh 0..n-1 index.
    """
    if raw is None:
        raw = INITIAL_DATA

    if isinstance(raw, pd.DataFrame):
        # Work on a copy so the caller's frame is never relabelled in place.
        table = raw.copy()
        table.columns = COLUMNS
    else:
        table = pd.DataFrame(raw, columns=COLUMNS)

    # Edited cells can hold text or blanks; coerce those to 0 so the sort
    # below always compares integers.
    for score_col in ("Standard FizzBuzz", "Buzz=7"):
        as_number = pd.to_numeric(table[score_col], errors="coerce")
        table[score_col] = as_number.fillna(0).astype(int)

    ranked = table.sort_values(by=["Buzz=7", "Standard FizzBuzz"], ascending=False)
    return ranked.reset_index(drop=True)


def create_chart(df, top_n=15):
    """Create a grouped horizontal bar chart of the top models using seaborn.

    Args:
        df: Leaderboard DataFrame with a "Model" column plus the score
            columns "Standard FizzBuzz" and "Buzz=7". The first ``top_n``
            rows are plotted as-is, so the frame should already be ranked
            (make_sorted_df produces that ordering).
        top_n: How many leading rows of ``df`` to plot. Defaults to 15.

    Returns:
        The matplotlib Figure holding the chart. It is already detached from
        pyplot's figure registry, so the caller does not need to close it.
    """
    top_models = df.head(top_n)

    # Reshape to long format for seaborn: one row per (model, task) pair.
    plot_df = top_models.melt(
        id_vars="Model",
        var_name="Task",
        value_name="Score",
    )

    fig, ax = plt.subplots(figsize=(15, 15))

    sns.barplot(
        data=plot_df,
        y="Model",
        x="Score",
        hue="Task",
        hue_order=['Buzz=7', 'Standard FizzBuzz'],
        orient="h",
        ax=ax,
        palette='colorblind'
    )

    # Fixed axis so bars stay comparable between redraws; assumes scores
    # never exceed 200 (the highest value in the shipped data).
    ax.set_xlim(0, 200)
    ax.set_xlabel("Successful turns")
    ax.set_ylabel("")
    ax.set_title(f"Top {top_n} models ranked by modified FizzBuzz (Buzz=7) score")
    ax.legend(loc="lower right")

    fig.tight_layout()

    # pyplot keeps every figure it creates registered until it is closed.
    # This function runs again on each table edit, so without this call the
    # process would accumulate one large figure per edit. Closing only
    # unregisters the figure; the returned object can still be rendered.
    plt.close(fig)
    return fig


def on_data_edit(table_data):
    """Rebuild the chart from the table's current contents.

    Wired to the table's input event in the UI section. The edited data is
    cleaned and re-ranked by make_sorted_df, then drawn by create_chart.

    Args:
        table_data: Current table contents as delivered by the event.

    Returns:
        The figure produced by create_chart for the re-ranked data.
    """
    return create_chart(make_sorted_df(table_data))


# --- Initial state: the built-in results, ranked for display ---
initial_df = make_sorted_df()

# --- Page styling: cap the app's width and centre it in the window ---
css = """
    .gradio-container {
        max-width: 800px !important;
        margin-left: auto !important;
        margin-right: auto !important;
    }
    """

# --- Layout: description, chart, then the editable score table ---
with gr.Blocks(title="FizzBuzz LLM Benchmark") as demo:
    gr.Markdown(DESCRIPTION)
    plot = gr.Plot(value=create_chart(initial_df))
    gr.Markdown("### All Model Scores")
    table = gr.Dataframe(
        value=initial_df,
        interactive=True,
        column_count=(3, "fixed"),
    )
    # Redraw the chart each time the user edits the table.
    table.input(fn=on_data_edit, inputs=[table], outputs=[plot])

demo.launch(css=css)