| import gradio as gr |
| import json |
| import pandas as pd |
| from urllib.request import urlopen, URLError |
| import re |
| from datetime import datetime |
|
|
| |
# BibTeX entry displayed in the "Citation" accordion of the UI. It is a raw
# string so the backslash in ``\url`` is kept literally.
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}"""
# Label shown on the text box that holds the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
| |
| |
| |
# Common prefix of the dated data files; find_latest_data_url() appends
# "<YYYYMMDD>.json" to it when probing for the newest file.
# NOTE(review): this is a plain-HTTP endpoint — confirm whether HTTPS is available.
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
|
|
def find_latest_data_url():
    """Find the most recent published data file by probing dated URLs.

    Starting with today's date and stepping back one day at a time (365
    candidates in total), builds ``{DATA_URL_BASE}{YYYYMMDD}.json`` and
    returns the first URL that can be opened.

    Returns:
        tuple: ``(url, date_str)`` where ``date_str`` is formatted as
        ``YYYYMMDD``, or ``(None, None)`` when no candidate could be opened.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timedelta

    today = datetime.now()
    for days_back in range(365):
        # timedelta arithmetic crosses month and year boundaries. The previous
        # ``today.replace(day=today.day - i)`` raised an uncaught ValueError as
        # soon as ``i`` reached the current day of the month.
        candidate = today - timedelta(days=days_back)
        date_str = candidate.strftime("%Y%m%d")
        url = f"{DATA_URL_BASE}{date_str}.json"
        try:
            # Only reachability matters here; the ``with`` closes the probe
            # response immediately instead of leaking the connection, and the
            # timeout keeps one stalled request from blocking the whole scan.
            with urlopen(url, timeout=10):
                return url, date_str
        except OSError:
            # URLError/HTTPError are OSError subclasses, as is a bare socket
            # timeout; any of them means "try the previous day".
            continue
    return None, None
|
|
def get_latest_data():
    """Return the newest data URL together with its date as ``YYYY-MM-DD``.

    Returns:
        tuple: ``(data_url, formatted_update_time)``.

    Raises:
        Exception: if ``find_latest_data_url`` does not yield a URL.
    """
    latest_url, raw_date = find_latest_data_url()
    if not latest_url:
        raise Exception("Could not find valid data URL")
    # The lookup reports the date as YYYYMMDD; re-render it with dashes.
    parsed_date = datetime.strptime(raw_date, "%Y%m%d")
    return latest_url, parsed_date.strftime("%Y-%m-%d")
|
|
def get_leaderboard_title(update_time):
    """Return the Markdown heading for the page, embedding ``update_time``."""
    heading_template = "# CompassAcademic Leaderboard (Last Updated: {})"
    return heading_template.format(update_time)
|
|
# Markdown introduction rendered at the top of the "Main Leaderboard" tab.
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals.
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""
|
|
# Choices (and default selection) of the "Model Size" checkbox group; the
# labels must match the branches handled in filter_table().
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
# Choices (and default selection) of the "Model Type" checkbox group;
# filter_table() maps 'API' to OpenSource == 'No' and 'OpenSource' to 'Yes'.
MODEL_TYPE = ['API', 'OpenSource']
|
|
def load_data(data_url, timeout=30):
    """Download a JSON document and return it as Python objects.

    Args:
        data_url: URL of the JSON file to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The decoded JSON payload (the body is decoded as UTF-8).

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    # The context manager closes the response even when reading fails;
    # previously the connection was left open.
    with urlopen(data_url, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload.decode('utf-8'))
|
|
def build_main_table(data):
    """Turn the raw leaderboard payload into the table shown in the UI.

    Reads the rows from ``data['globalData']['OverallTable']`` and the
    per-model metadata from ``data['models']``, derives an ``OpenSource``
    Yes/No flag and a ``Rank`` (1 = highest ``Average``; ties share the
    lowest rank), then selects and renames the display columns.

    Args:
        data: Decoded JSON payload containing ``globalData`` and ``models``.

    Returns:
        pandas.DataFrame with the display columns in presentation order.
    """
    table = pd.DataFrame(data['globalData']['OverallTable'])
    model_info = data['models']

    def open_source_flag(model_name):
        # Anything whose release type is not exactly 'OpenSource' counts as 'No'.
        if model_info[model_name]['release'] == 'OpenSource':
            return 'Yes'
        return 'No'

    table['OpenSource'] = table['model'].apply(open_source_flag)

    ranks = table['Average'].rank(ascending=False, method='min')
    table['Rank'] = ranks.astype(int)

    # Source column -> display name; the key order is the display order.
    display_names = {
        'Rank': 'Rank',
        'model': 'Model',
        'org': 'Organization',
        'num': 'Parameters',
        'OpenSource': 'OpenSource',
        'Average': 'Average Score',
        'BBH': 'BBH',
        'Math-500': 'Math-500',
        'AIME': 'AIME',
        'MMLU-Pro': 'MMLU-Pro',
        'LiveCodeBench': 'LiveCodeBench',
        'HumanEval': 'HumanEval',
        'GQPA-Diamond': 'GQPA-Diamond',
        'IFEval': 'IFEval',
    }
    selected = table[list(display_names)]
    return selected.rename(columns=display_names)
|
|
def _parse_size_in_b(param):
    """Convert a parameter label such as ``'7B'`` to a float number of billions.

    Returns NaN for ``'N/A'`` and for any other value that cannot be parsed
    (including non-string values).
    """
    try:
        return float(param.replace('B', ''))
    except (AttributeError, TypeError, ValueError):
        # Narrow replacement for the former bare ``except:``, which would also
        # have swallowed KeyboardInterrupt/SystemExit.
        return float('nan')


def filter_table(df, size_ranges, model_types):
    """Filter the leaderboard by model size bucket and model type.

    Args:
        df: Table with a ``Parameters`` column (labels like ``'7B'`` or
            ``'N/A'``) and an ``OpenSource`` column (``'Yes'``/``'No'``).
        size_ranges: Selected size labels out of ``'<10B'``, ``'10B-70B'``,
            ``'>70B'`` and ``'Unknown'``. A row is kept when it matches any
            selected label; unrecognised labels are ignored. An empty or
            ``None`` selection disables size filtering.
        model_types: Selected type labels out of ``'API'`` and
            ``'OpenSource'``. An empty or ``None`` selection disables type
            filtering.

    Returns:
        A new DataFrame with the matching rows; ``df`` itself is not modified.
    """
    filtered_df = df.copy()

    if size_ranges:
        # Keep the parsed sizes in a separate Series instead of adding and
        # later dropping a temporary column on the table.
        sizes = pd.Series(
            [_parse_size_in_b(param) for param in filtered_df['Parameters']],
            index=filtered_df.index,
            dtype=float,
        )
        # NaN compares False in every range test, so unparsable sizes can only
        # be selected through 'Unknown'.
        size_masks = {
            '<10B': sizes < 10,
            '10B-70B': (sizes >= 10) & (sizes < 70),
            '>70B': sizes >= 70,
            'Unknown': sizes.isna(),
        }
        mask = pd.Series(False, index=filtered_df.index)
        for size_range in size_ranges:
            if size_range in size_masks:
                mask |= size_masks[size_range]
        filtered_df = filtered_df[mask]

    if model_types:
        # UI label -> value stored in the 'OpenSource' column.
        type_to_flag = {'API': 'No', 'OpenSource': 'Yes'}
        wanted_flags = [
            type_to_flag[model_type]
            for model_type in model_types
            if model_type in type_to_flag
        ]
        filtered_df = filtered_df[filtered_df['OpenSource'].isin(wanted_flags)]

    return filtered_df
|
|
def calculate_column_widths(df, min_width=160, max_width=400):
    """Estimate a display width for every column of ``df``.

    Each width is the larger of ``10 * len(header)`` and
    ``8 * len(longest cell rendered as text)``, plus 20 of padding, clamped
    to ``[min_width, max_width]``.

    Args:
        df: Table whose columns should be measured; it may have no rows.
        min_width: Lower bound applied to every width.
        max_width: Upper bound applied to every width.

    Returns:
        list[int]: One plain Python ``int`` per column, in column order.
    """
    column_widths = []
    for column in df.columns:
        header_length = len(str(column))
        content_lengths = df[column].astype(str).map(len)
        # A column without rows has no maximum (pandas yields NaN); treat it
        # as zero-length content so only the header drives the width.
        if content_lengths.empty:
            max_content_length = 0
        else:
            # pandas returns a NumPy integer; convert so the result list holds
            # plain ints only.
            max_content_length = int(content_lengths.max())
        width = max(header_length * 10, max_content_length * 8) + 20
        column_widths.append(int(max(min_width, min(max_width, width))))
    return column_widths
|
|
class DataState:
    """Mutable holder for the most recently built leaderboard table."""

    def __init__(self) -> None:
        # Unfiltered leaderboard DataFrame; stays None until a load succeeds.
        self.current_df = None


# Module-level instance shared by the callbacks defined in create_interface():
# the data loaders store the table here and the filter callback reads it.
data_state = DataState()
|
|
def create_interface():
    """Build the Gradio app for the CompassAcademic leaderboard.

    Downloads the initial data, lays out the title, description, size/type
    filters, results table, refresh button, status line and citation box, and
    wires up the events:

    * changing either checkbox group re-filters the cached table;
    * the refresh button re-downloads the data and re-applies the currently
      selected filters;
    * the page-load event refreshes the data (registered with ``every=21600``).

    When a refresh fails, the title and table are left as they are and only
    the status line reports the failure.

    Returns:
        The assembled ``gr.Blocks`` instance (not launched).
    """
    # Placeholder table (headers only) used whenever no data is available.
    empty_df = pd.DataFrame(columns=[
        'Rank', 'Model', 'Organization', 'Parameters', 'OpenSource',
        'Average Score', 'BBH', 'Math-500', 'AIME', 'MMLU-Pro',
        'LiveCodeBench', 'HumanEval', 'GQPA-Diamond', 'IFEval'
    ])

    def update_table(size_ranges, model_types):
        """Filter the cached table by the selections, best average first."""
        if data_state.current_df is None:
            return empty_df
        filtered_df = filter_table(data_state.current_df, size_ranges, model_types)
        return filtered_df.sort_values("Average Score", ascending=False)

    def fetch_leaderboard(size_ranges, model_types):
        """Download the newest data, cache it, and return (title, table)."""
        data_url, update_time = get_latest_data()
        data_state.current_df = build_main_table(load_data(data_url))
        return get_leaderboard_title(update_time), update_table(size_ranges, model_types)

    def load_initial_data():
        """Load data for the first render, falling back to an empty table."""
        try:
            return fetch_leaderboard(MODEL_SIZE, MODEL_TYPE)
        except Exception as e:
            # UI boundary: report the problem and still render the page.
            print(f"Error loading initial data: {e}")
            return "# CompassAcademic Leaderboard (Error loading data)", empty_df

    def refresh_data(size_ranges, model_types):
        """Reload the data; returns (None, None) when the reload fails."""
        try:
            return fetch_leaderboard(size_ranges, model_types)
        except Exception as e:
            print(f"Error refreshing data: {e}")
            return None, None

    def refresh_outputs(kind, size_ranges, model_types):
        """Build the (title, table, status) outputs for a refresh event."""
        title, data = refresh_data(size_ranges, model_types)
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if title is None or data is None:
            # Returning None would blank the components. An argument-less
            # gr.update() leaves the title and table showing the previous data.
            return gr.update(), gr.update(), f"Last {kind} update failed: {timestamp}"
        return title, data, f"Last {kind} update: {timestamp}"

    def auto_refresh():
        """Refresh handler for the page-load event (default filters)."""
        return refresh_outputs("auto", MODEL_SIZE, MODEL_TYPE)

    def refresh_and_update(size_ranges, model_types):
        """Refresh handler for the button; honours the selected filters."""
        return refresh_outputs("manual", size_ranges, model_types)

    initial_title, initial_data = load_initial_data()

    # NOTE(review): the nesting below was reconstructed from the statement
    # order; confirm the intended placement of the button and status line.
    with gr.Blocks() as demo:
        title_comp = gr.Markdown(initial_title)

        with gr.Tabs():
            with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
                gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)

                with gr.Row():
                    with gr.Column():
                        size_filter = gr.CheckboxGroup(
                            choices=MODEL_SIZE,
                            value=MODEL_SIZE,
                            label='Model Size',
                            interactive=True,
                        )
                    with gr.Column():
                        type_filter = gr.CheckboxGroup(
                            choices=MODEL_TYPE,
                            value=MODEL_TYPE,
                            label='Model Type',
                            interactive=True,
                        )

                with gr.Column():
                    table = gr.DataFrame(
                        value=initial_data,
                        interactive=False,
                        wrap=False,
                        column_widths=calculate_column_widths(initial_data),
                    )

                refresh_button = gr.Button("Refresh Data")
                update_status = gr.Markdown("Last update: " + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

                # Pass the checkbox values so the refreshed table matches the
                # filters the user currently has selected.
                refresh_button.click(
                    fn=refresh_and_update,
                    inputs=[size_filter, type_filter],
                    outputs=[title_comp, table, update_status],
                )

                demo.load(
                    fn=auto_refresh,
                    outputs=[title_comp, table, update_status],
                    every=21600
                )

                size_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

                type_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id='citation-button',
                )

    return demo
|
|
if __name__ == '__main__':
    # Build the UI; create_interface() also performs the initial data load.
    demo = create_interface()
    # Turn on the request queue before launching.
    demo.queue()
    # Bind to all network interfaces rather than localhost only.
    demo.launch(server_name='0.0.0.0')