Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from src.display.formatting import make_clickable_model | |
# Static benchmark data.
#
# Each row below is (model, Avg, Nar, MT, Con, IF, Scn, Saf, Att); the rows are
# expanded into one dict per model, keyed by the names in _LEADERBOARD_COLUMNS.
# The metric abbreviations are kept exactly as they appear in the data keys.
_LEADERBOARD_COLUMNS = ("model", "Avg", "Nar", "MT", "Con", "IF", "Scn", "Saf", "Att")

_LEADERBOARD_ROWS = (
    # Open-source Models
    ("internlm/internlm2-20b-reward", 70.58, 70.37, 68.25, 67.61, 76.00, 72.73, 66.10, 75.00),
    ("allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", 70.36, 66.67, 71.43, 70.42, 70.00, 65.15, 76.27, 70.59),
    ("Skywork/Skywork-Reward-V2-Qwen3-8B", 70.07, 64.81, 69.84, 67.61, 66.00, 75.76, 74.58, 77.94),
    ("internlm/internlm2-7b-reward", 67.72, 64.81, 63.49, 64.79, 68.00, 72.73, 72.88, 66.18),
    ("allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", 67.53, 70.37, 65.08, 60.56, 76.00, 71.21, 67.80, 61.76),
    ("allenai/Llama-3.1-70B-Instruct-RM-RB2", 66.39, 72.22, 65.08, 56.34, 62.00, 65.15, 76.27, 67.65),
    ("allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", 66.34, 70.37, 61.90, 60.56, 72.00, 72.73, 69.49, 60.29),
    ("allenai/Llama-3.1-8B-Instruct-RM-RB2", 65.06, 59.26, 61.94, 59.15, 70.00, 72.73, 71.19, 61.16),
    ("allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", 64.89, 66.67, 60.32, 57.75, 70.00, 66.67, 66.10, 64.71),
    ("Skywork/Skywork-Reward-V2-Llama-3.1-8B", 64.17, 53.70, 63.49, 60.56, 66.00, 71.21, 69.49, 64.71),
    ("CharacterRM", 61.11, 59.26, 65.08, 56.34, 72.00, 66.67, 52.54, 55.88),
    ("infly/INF-ORM-Llama3.1-70B", 58.51, 61.11, 61.90, 50.70, 58.00, 56.06, 64.41, 57.35),
    ("Ray2333/GRM_Llama3.1_8B_rewardmodel-ft", 56.50, 53.70, 58.73, 57.75, 56.00, 56.06, 59.32, 52.94),
    ("Skywork/Skywork-Reward-Llama-3.1-8B", 53.50, 48.15, 50.79, 50.70, 58.00, 59.09, 55.93, 50.00),
    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", 51.97, 42.58, 50.79, 45.07, 60.00, 50.06, 55.93, 57.35),
    ("nicolinho/QRM-Llama3.1-8B-v2", 47.42, 44.44, 58.73, 40.85, 46.00, 50.00, 43.37, 48.53),
    ("NCSOFT/Llama-3-OffsetBias-RM-8B", 47.17, 44.44, 49.21, 39.44, 32.00, 50.00, 69.49, 45.59),
    # Proprietary Models
    ("GPT-5-mini-2025-08-07", 69.30, 68.52, 73.02, 59.86, 83.00, 68.94, 70.34, 65.44),
    ("GPT-4o-2024-08-06", 69.12, 66.67, 66.67, 66.90, 71.00, 68.18, 78.81, 67.65),
    ("GPT-5-2025-08-07", 67.55, 69.44, 66.67, 66.20, 82.00, 65.91, 60.17, 62.50),
    ("Claude-3-7-sonnet-20250219", 65.24, 68.52, 62.70, 65.49, 75.00, 62.88, 61.02, 61.76),
    # Ours
    ("RoleRM", 88.32, 90.74, 82.54, 80.28, 94.00, 90.91, 91.53, 88.24),
)

# One dict per model, in the same order as the rows above.
LEADERBOARD_DATA = [dict(zip(_LEADERBOARD_COLUMNS, row)) for row in _LEADERBOARD_ROWS]
def get_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build the leaderboard dataframe from the static benchmark data.

    Args:
        cols: Column names to keep in the result, in display order. Each must
            be "Model", "Avg", or one of the names in ``benchmark_cols``.
        benchmark_cols: Keys copied from every ``LEADERBOARD_DATA`` entry into
            the frame.

    Returns:
        A dataframe with one row per entry, ordered by descending "Avg",
        restricted to ``cols`` and rounded to two decimals.

    Raises:
        KeyError: If a name in ``benchmark_cols`` is missing from an entry, or
            a name in ``cols`` is not a column of the assembled frame.
    """
    records = []
    for entry in LEADERBOARD_DATA:
        # "Avg" is always carried as the ranking key so the sort below works
        # even when the caller leaves it out of benchmark_cols; the final
        # df[cols] selection drops it again if it was not requested.
        record = {
            "Model": make_clickable_model(entry["model"]),
            "Avg": entry["Avg"],
        }
        record.update({col: entry[col] for col in benchmark_cols})
        records.append(record)

    df = pd.DataFrame.from_records(records)
    df = df.sort_values(by=["Avg"], ascending=False)
    return df[cols].round(decimals=2)