| | """A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" |
| |
|
| | import ast |
| | import argparse |
| | import glob |
| | import pickle |
| |
|
| | import gradio as gr |
| | import numpy as np |
| | import pandas as pd |
| | import plotly.graph_objects as go |
| | import pandas as pd |
| |
|
| |
|
| |
|
# Completion cost per 1M tokens, in USD, for each evaluated model.
# Used by get_full_table() to render the "💰 Cost (1M-Tokens)" column.
# NOTE(review): presumably OpenRouter pricing at evaluation time — confirm.
MODEL_NAME_COST = {
    "anthropic/claude-2.1": 8,
    "anthropic/claude-3-haiku": 0.25,
    "anthropic/claude-3-opus": 15,
    "anthropic/claude-3-sonnet": 3,
    "cohere/command-r": 0.5,
    "google/gemini-pro": 0.12,
    "google/gemma-7b-it": 0.1,
    "mistralai/mistral-large": 8,
    "mistralai/mistral-medium": 2.7,
    "mistralai/mixtral-8x7b-instruct": 0.7,
    "openai/gpt-3.5-turbo": 0.5,
    "openai/gpt-4-1106-preview": 10,
}
| |
|
| |
|
def make_default_md():
    """Return the static introductory markdown shown above the leaderboard.

    Returns:
        str: Markdown with the title, author links, dataset descriptions,
        and a pointer to the evaluation code.
    """
    # Plain string literal: nothing is interpolated, so no f-string needed.
    # Fixed user-facing typos: "a evaluation leadboard" -> "an evaluation
    # leaderboard", "on following" -> "on the following".
    leaderboard_md = """
# 🏆 CZ-EVAL Leaderboard
[Developer](https://me.hynky.name/) | [Twitter](https://twitter.com/HKydlicek)

CZ-EVAL is an evaluation leaderboard of tasks in Czech for LLMs.

It's evaluated on the following datasets:

- Math Problems Understanding [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa)
- Reasoning and General Knowledge [TSP-QA](https://huggingface.co/datasets/hynky/tsp-qa)

💻 Code: The evaluation code can be found at [hynky1999/LLM-Eval](https://github.com/hynky1999/LLM-Eval). Model inference is done using [Open-Router](https://openrouter.ai/) or on cloud using [Modal Labs](https://modal.com/).
"""
    return leaderboard_md
| |
|
| |
|
def make_arena_leaderboard_md(arena_df):
    """Build the short markdown header shown above the leaderboard table.

    Args:
        arena_df: Any sized object (len() gives the number of models).

    Returns:
        str: Markdown stating the model count and last-update date.
    """
    return f"""
Total #models: **{len(arena_df)}**. Last updated: Mar 17, 2024.
"""
| |
|
| |
|
def make_full_leaderboard_md(elo_results):
    """Return markdown describing the benchmarks shown in the full table.

    Args:
        elo_results: Unused; kept so the existing call signature is unchanged.

    Returns:
        str: Markdown listing the two CZ-EVAL benchmarks.
    """
    # Fixed leftover copy-paste text: the intro previously claimed
    # "Arena Elo, MT-Bench and MMLU" (from the Chatbot Arena template),
    # which contradicted the two datasets actually listed below.
    # The truncated "Comprehensive dataset of" sentence was completed;
    # NOTE(review): TSP is assumed to be a university admission aptitude
    # test set — confirm the wording against the dataset card.
    leaderboard_md = """
Two benchmarks are displayed: **Klokan-QA** and **TSP**.
- [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa) - Mathematical competitions dataset
- [TSP](https://huggingface.co/datasets/hynky/TSP) - Comprehensive dataset of university admission aptitude tests

"""
    return leaderboard_md
| |
|
| |
|
| | |
| |
|
| |
|
def plot_spider(df, title):
    """Render a radar (spider) chart with one polygon trace per dataframe row.

    Args:
        df: DataFrame whose first column holds the trace (model) names and
            whose remaining columns hold scores — assumed to be on a 0-100
            scale, since the radial axis is fixed to [0, 100]. TODO confirm
            with callers that values are already percentages.
        title: Chart title text.

    Returns:
        plotly.graph_objects.Figure: The configured polar figure.
    """
    categories = df.columns.tolist()[1:]
    # Repeat the first category at the end so each polygon closes.
    categories = [
        *categories,
        categories[0],
    ]
    # Fixed palette; long enough that one chart won't run out of colors
    # (indexing is by row position, so >24 rows would raise IndexError).
    colors = [
        '#1f77b4',
        '#ff7f0e',
        '#2ca02c',
        '#d62728',
        '#9467bd',
        '#8c564b',
        '#e377c2',
        '#7f7f7f',
        '#bcbd22',
        '#17becf',
        '#f7b6d2',
        '#bcbd22',
        '#dbdb8d',
        '#17becf',
        '#9edae5',
        '#c5b0d5',
        '#c49c94',
        '#f7b6d2',
        '#bcbd22',
        '#dbdb8d',
        '#17becf',
        '#9edae5',
        '#c5b0d5',
        '#c49c94',
    ]

    fig_1000 = go.Figure()

    for i, (idx, row) in enumerate(df.iterrows()):
        # Use explicit positional access: `row[0]` on a Series is
        # deprecated label-vs-position indexing in pandas 2.x.
        name = row.iloc[0]
        values = row.tolist()[1:]
        # Close the polygon by repeating the first value.
        values = values + [values[0]]
        color = colors[i]
        fig_1000.add_trace(
            go.Scatterpolar(
                r=values,
                theta=categories,
                opacity=0.4,
                name=name,
                line=dict(
                    color=color, width=4
                ),
            )
        )

    fig_1000.update_layout(
        width=600,
        height=950,
        polar=dict(
            angularaxis=dict(
                gridwidth=2,
                rotation=90,
                direction="clockwise",
            ),
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                angle=45,
                tickangle=45,
                tickvals=[0, 25, 50, 75, 100],
                ticktext=["0%", "25%", "50%", "75%", "100%"],
            ),
        ),
        title_text=title,
        title_x=0.5,
        title_y=0.97,
        title_xanchor="center",
        title_yanchor="top",
        title_font_size=24,
        title_font_color="#333333",
        font=dict(family="Arial", size=16, color="#333333"),
        # Horizontal legend below the chart (negative y pushes it under).
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.45, xanchor="center", x=0.5
        ),
    )
    return fig_1000
| |
|
| |
|
def openrouter_hyperlink(model_name):
    """Wrap *model_name* in an HTML anchor linking to its OpenRouter page."""
    url = f"https://openrouter.ai/models/{model_name}"
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{url}" style="{style}">{model_name}</a>'
| |
|
| |
|
def get_full_table(model_table_df):
    """Turn raw per-benchmark scores into the display-ready leaderboard table.

    Adds an average column (scores scaled to percent), ranks models by it,
    attaches completion cost and an OpenRouter link, then renames columns
    to their emoji display headers.

    Args:
        model_table_df: DataFrame with a ``model_name`` column plus the
            numeric columns klokan/culture/analytical/critical/verbal
            (assumed to be fractions in [0, 1] — they are multiplied by 100).

    Returns:
        pandas.DataFrame: New dataframe ordered best-to-worst.

    Raises:
        KeyError: If a model name is missing from MODEL_NAME_COST.
    """
    num_cols = ["klokan", "culture", "analytical", "critical", "verbal"]

    # Work on a copy so the caller's dataframe is not mutated in place
    # (the original version sorted/renamed the caller's object as a side
    # effect, which callers did not rely on — they rebind the return value).
    model_table_df = model_table_df.copy()

    # Per-model mean over the five sub-benchmarks, then scale to percent.
    model_table_df["average"] = model_table_df[num_cols].mean(axis=1)
    model_table_df[num_cols + ["average"]] = model_table_df[
        num_cols + ["average"]
    ].apply(lambda x: round(x * 100, 2))

    # Rank by average score, best first.
    model_table_df.sort_values(by="average", ascending=False, inplace=True)
    model_table_df.insert(0, "rank", np.arange(1, len(model_table_df) + 1))

    # Cost column, e.g. "0.5$" per 1M tokens.
    model_table_df["completion_price"] = model_table_df["model_name"].apply(
        lambda x: f"{MODEL_NAME_COST[x]}$"
    )

    # Replace plain names with HTML links to the model's OpenRouter page.
    model_table_df["model_name"] = model_table_df["model_name"].apply(
        openrouter_hyperlink
    )

    # Fix the display column order.
    model_table_df = model_table_df[
        [
            "rank",
            "model_name",
            "completion_price",
            "klokan",
            "culture",
            "analytical",
            "critical",
            "verbal",
            "average",
        ]
    ]

    model_table_df.rename(
        columns={
            "model_name": "🤖 Model",
            "completion_price": "💰 Cost (1M-Tokens)",
            "klokan": "🧮 Klokan-QA",
            "culture": "🌍 TSP-Culture",
            "analytical": "🔍 TSP-Analytical",
            "critical": "💡 TSP-Critical",
            "verbal": "📖 TSP-Verbal",
            "average": "📊 Average",
        },
        inplace=True,
    )

    return model_table_df
| |
|
| |
|
def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_file):
    """Build the leaderboard tab: main results table plus two spider charts.

    Args:
        leaderboard_table_file: CSV path with raw per-model scores.
        klokan_table_file: CSV path for the Klokan-QA per-difficulty chart.
        tsp_table_file: CSV path for the TSP per-category chart.

    Returns:
        list: The created Gradio components [markdown, plot, plot].
    """
    results = pd.read_csv(leaderboard_table_file)
    results = get_full_table(results)

    default_md = make_default_md()

    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
    with gr.Tabs() as tabs:
        with gr.Tab("CZ-EVAL Leaderboard", id=0):
            md = make_arena_leaderboard_md(results)
            gr.Markdown(md, elem_id="leaderboard_markdown")
            gr.Dataframe(
                # One datatype per displayed column (the original list had
                # 11 entries for 9 columns and typed the string cost column
                # as "number"): rank, model link, cost, six score columns.
                datatype=[
                    "str",
                    "markdown",
                    "str",
                    "number",
                    "number",
                    "number",
                    "number",
                    "number",
                    "number",
                ],
                value=results,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[
                    70,
                    200,
                    110,
                    120,
                    120,
                    120,
                    120,
                    100,
                    100,
                ],
                wrap=True,
            )

            # Spider charts; typo fixed: "Acurracy" -> "Accuracy".
            p1 = plot_spider(pd.read_csv(klokan_table_file), "Klokan-QA - Accuracy")
            p2 = plot_spider(pd.read_csv(tsp_table_file), "TSP - Accuracy")

            gr.Markdown(
                """## More Statistics for CZ-EVAL\n
Below are figures for more statistics.
""",
                elem_id="leaderboard_markdown",
            )
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        "#### Figure 1: Performance of models on Klokan-QA per difficulty"
                    )
                    plot_1 = gr.Plot(p1, show_label=False)
                with gr.Column():
                    gr.Markdown("#### Figure 2: Performance of models on TSP dataset")
                    plot_2 = gr.Plot(p2, show_label=False)

    return [md_1, plot_1, plot_2]
| |
|
| |
|
# Global CSS injected into the Gradio app: markdown font sizing and table
# padding, tighter leaderboard rows, a hidden footer, and image layout
# rules for images embedded in markdown.
block_css = """
#notice_markdown {
    font-size: 104%
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_markdown {
    font-size: 104%
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
footer {
    display:none !important
}
.image-container {
    display: flex;
    align-items: center;
    padding: 1px;
}
.image-container img {
    margin: 0 30px;
    height: 20px;
    max-height: 100%;
    width: auto;
    max-width: 20%;
}
"""
| |
|
| |
|
def build_demo(leadboard_table, klokan_table, tsp_table):
    """Assemble the full Gradio Blocks app for the static leaderboard.

    Args:
        leadboard_table: CSV path with the raw leaderboard scores.
        klokan_table: CSV path for the Klokan-QA spider chart.
        tsp_table: CSV path for the TSP spider chart.

    Returns:
        gr.Blocks: The assembled (not yet launched) app.
    """
    theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)

    with gr.Blocks(
        title="CZ-EVAL Leaderboard",
        theme=theme,
        css=block_css,
    ) as demo:
        build_leaderboard_tab(leadboard_table, klokan_table, tsp_table)
    return demo
| |
|
| |
|
# Build the app at import time: Hugging Face Spaces imports this module and
# serves the module-level `demo` object directly, so this must run on import.
demo = build_demo(
    leadboard_table="./leaderboard/table.csv",
    klokan_table="./leaderboard/klokan.csv",
    tsp_table="./leaderboard/tsp.csv",
)

# Start the local Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()
| |
|