Spaces:
Running
Running
Update space
Browse files- src/about.py +10 -3
- src/display/formatting.py +10 -3
- src/populate.py +26 -25
src/about.py
CHANGED
|
@@ -29,9 +29,16 @@ TITLE = """<h1 align="center" id="space-title">🎭 RoleRMBench Leaderboard</h1>
|
|
| 29 |
|
| 30 |
# What does your leaderboard evaluate?
|
| 31 |
INTRODUCTION_TEXT = """
|
| 32 |
-
|
| 33 |
"""
|
| 34 |
|
| 35 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 36 |
-
CITATION_BUTTON_TEXT = r"""
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
A Tencent Youtu work. For more information, please refer to: [https://github.com/Dear-Sloth/RoleRMBench](https://github.com/Dear-Sloth/RoleRMBench)
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# NOTE(review): citation key, author, eprint, primaryClass and the arXiv URL are
# still empty placeholders — TODO fill these in once the arXiv record exists.
CITATION_BUTTON_TEXT = r"""@misc{ ,
      title={RoleRMBench & RoleRM: Towards Reward Modeling for Profile-Based Role Play in Dialogue Systems},
      author={ },
      year={2025},
      eprint={ },
      archivePrefix={arXiv},
      primaryClass={ },
      url={https://arxiv.org/abs/ },
}"""
|
src/display/formatting.py
CHANGED
|
@@ -2,9 +2,16 @@ def model_hyperlink(link, model_name):
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
| 5 |
-
def make_clickable_model(model_name):
|
| 6 |
-
link
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def styled_error(error):
|
|
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
def make_clickable_model(model_name, custom_link=None):
    """Creates a clickable model link. If custom_link is None, no link is created.

    Args:
        model_name: Display name of the model (also the HF repo path when
            custom_link == "auto").
        custom_link: None -> return the bare name (no hyperlink);
            "auto" -> link to https://huggingface.co/<model_name>;
            any other string -> treated as the Hugging Face repo path to link to.

    Returns:
        Either the plain model name or an HTML anchor produced by
        model_hyperlink().
    """
    if custom_link is None:
        return model_name
    # Both linked cases differ only in which repo path is used — dedupe them.
    repo_path = model_name if custom_link == "auto" else custom_link
    return model_hyperlink(f"https://huggingface.co/{repo_path}", model_name)
|
| 15 |
|
| 16 |
|
| 17 |
def styled_error(error):
|
src/populate.py
CHANGED
|
@@ -4,32 +4,33 @@ from src.display.formatting import make_clickable_model
|
|
| 4 |
|
| 5 |
|
| 6 |
# Static benchmark data
|
|
|
|
| 7 |
LEADERBOARD_DATA = [
|
| 8 |
# Open-source Models
|
| 9 |
-
{"model": "internlm/internlm2-20b-reward", "Avg": 70.58, "Nar": 70.37, "MT": 68.25, "Con": 67.61, "IF": 76.00, "Scn": 72.73, "Saf": 66.10, "Att": 75.00},
|
| 10 |
-
{"model": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", "Avg": 70.36, "Nar": 66.67, "MT": 71.43, "Con": 70.42, "IF": 70.00, "Scn": 65.15, "Saf": 76.27, "Att": 70.59},
|
| 11 |
-
{"model": "Skywork/Skywork-Reward-V2-Qwen3-8B", "Avg": 70.07, "Nar": 64.81, "MT": 69.84, "Con": 67.61, "IF": 66.00, "Scn": 75.76, "Saf": 74.58, "Att": 77.94},
|
| 12 |
-
{"model": "internlm/internlm2-7b-reward", "Avg": 67.72, "Nar": 64.81, "MT": 63.49, "Con": 64.79, "IF": 68.00, "Scn": 72.73, "Saf": 72.88, "Att": 66.18},
|
| 13 |
-
{"model": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", "Avg": 67.53, "Nar": 70.37, "MT": 65.08, "Con": 60.56, "IF": 76.00, "Scn": 71.21, "Saf": 67.80, "Att": 61.76},
|
| 14 |
-
{"model": "allenai/Llama-3.1-70B-Instruct-RM-RB2", "Avg": 66.39, "Nar": 72.22, "MT": 65.08, "Con": 56.34, "IF": 62.00, "Scn": 65.15, "Saf": 76.27, "Att": 67.65},
|
| 15 |
-
{"model": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", "Avg": 66.34, "Nar": 70.37, "MT": 61.90, "Con": 60.56, "IF": 72.00, "Scn": 72.73, "Saf": 69.49, "Att": 60.29},
|
| 16 |
-
{"model": "allenai/Llama-3.1-8B-Instruct-RM-RB2", "Avg": 65.06, "Nar": 59.26, "MT": 61.94, "Con": 59.15, "IF": 70.00, "Scn": 72.73, "Saf": 71.19, "Att": 61.16},
|
| 17 |
-
{"model": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", "Avg": 64.89, "Nar": 66.67, "MT": 60.32, "Con": 57.75, "IF": 70.00, "Scn": 66.67, "Saf": 66.10, "Att": 64.71},
|
| 18 |
-
{"model": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", "Avg": 64.17, "Nar": 53.70, "MT": 63.49, "Con": 60.56, "IF": 66.00, "Scn": 71.21, "Saf": 69.49, "Att": 64.71},
|
| 19 |
-
{"model": "CharacterRM", "Avg": 61.11, "Nar": 59.26, "MT": 65.08, "Con": 56.34, "IF": 72.00, "Scn": 66.67, "Saf": 52.54, "Att": 55.88},
|
| 20 |
-
{"model": "infly/INF-ORM-Llama3.1-70B", "Avg": 58.51, "Nar": 61.11, "MT": 61.90, "Con": 50.70, "IF": 58.00, "Scn": 56.06, "Saf": 64.41, "Att": 57.35},
|
| 21 |
-
{"model": "Ray2333/GRM_Llama3.1_8B_rewardmodel-ft", "Avg": 56.50, "Nar": 53.70, "MT": 58.73, "Con": 57.75, "IF": 56.00, "Scn": 56.06, "Saf": 59.32, "Att": 52.94},
|
| 22 |
-
{"model": "Skywork/Skywork-Reward-Llama-3.1-8B", "Avg": 53.50, "Nar": 48.15, "MT": 50.79, "Con": 50.70, "IF": 58.00, "Scn": 59.09, "Saf": 55.93, "Att": 50.00},
|
| 23 |
-
{"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "Avg": 51.97, "Nar": 42.58, "MT": 50.79, "Con": 45.07, "IF": 60.00, "Scn": 50.06, "Saf": 55.93, "Att": 57.35},
|
| 24 |
-
{"model": "nicolinho/QRM-Llama3.1-8B-v2", "Avg": 47.42, "Nar": 44.44, "MT": 58.73, "Con": 40.85, "IF": 46.00, "Scn": 50.00, "Saf": 43.37, "Att": 48.53},
|
| 25 |
-
{"model": "NCSOFT/Llama-3-OffsetBias-RM-8B", "Avg": 47.17, "Nar": 44.44, "MT": 49.21, "Con": 39.44, "IF": 32.00, "Scn": 50.00, "Saf": 69.49, "Att": 45.59},
|
| 26 |
-
# Proprietary Models
|
| 27 |
-
{"model": "GPT-5-mini-2025-08-07", "Avg": 69.30, "Nar": 68.52, "MT": 73.02, "Con": 59.86, "IF": 83.00, "Scn": 68.94, "Saf": 70.34, "Att": 65.44},
|
| 28 |
-
{"model": "GPT-4o-2024-08-06", "Avg": 69.12, "Nar": 66.67, "MT": 66.67, "Con": 66.90, "IF": 71.00, "Scn": 68.18, "Saf": 78.81, "Att": 67.65},
|
| 29 |
-
{"model": "GPT-5-2025-08-07", "Avg": 67.55, "Nar": 69.44, "MT": 66.67, "Con": 66.20, "IF": 82.00, "Scn": 65.91, "Saf": 60.17, "Att": 62.50},
|
| 30 |
-
{"model": "Claude-3-7-sonnet-20250219", "Avg": 65.24, "Nar": 68.52, "MT": 62.70, "Con": 65.49, "IF": 75.00, "Scn": 62.88, "Saf": 61.02, "Att": 61.76},
|
| 31 |
-
# Ours
|
| 32 |
-
{"model": "RoleRM", "Avg": 88.32, "Nar": 90.74, "MT": 82.54, "Con": 80.28, "IF": 94.00, "Scn": 90.91, "Saf": 91.53, "Att": 88.24},
|
| 33 |
]
|
| 34 |
|
| 35 |
|
|
@@ -38,7 +39,7 @@ def get_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
|
|
| 38 |
records = []
|
| 39 |
for entry in LEADERBOARD_DATA:
|
| 40 |
record = {
|
| 41 |
-
"Model": make_clickable_model(entry["model"]),
|
| 42 |
}
|
| 43 |
for col in benchmark_cols:
|
| 44 |
record[col] = entry[col]
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
# Static benchmark data
# link: "auto" = use model name as HuggingFace link, None = no link, or custom path
LEADERBOARD_DATA = [
    # Open-source Models
    {"model": "internlm/internlm2-20b-reward", "link": "auto", "Avg": 70.58, "Nar": 70.37, "MT": 68.25, "Con": 67.61, "IF": 76.00, "Scn": 72.73, "Saf": 66.10, "Att": 75.00},
    {"model": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", "link": "auto", "Avg": 70.36, "Nar": 66.67, "MT": 71.43, "Con": 70.42, "IF": 70.00, "Scn": 65.15, "Saf": 76.27, "Att": 70.59},
    {"model": "Skywork/Skywork-Reward-V2-Qwen3-8B", "link": "auto", "Avg": 70.07, "Nar": 64.81, "MT": 69.84, "Con": 67.61, "IF": 66.00, "Scn": 75.76, "Saf": 74.58, "Att": 77.94},
    {"model": "internlm/internlm2-7b-reward", "link": "auto", "Avg": 67.72, "Nar": 64.81, "MT": 63.49, "Con": 64.79, "IF": 68.00, "Scn": 72.73, "Saf": 72.88, "Att": 66.18},
    {"model": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", "link": "auto", "Avg": 67.53, "Nar": 70.37, "MT": 65.08, "Con": 60.56, "IF": 76.00, "Scn": 71.21, "Saf": 67.80, "Att": 61.76},
    {"model": "allenai/Llama-3.1-70B-Instruct-RM-RB2", "link": "auto", "Avg": 66.39, "Nar": 72.22, "MT": 65.08, "Con": 56.34, "IF": 62.00, "Scn": 65.15, "Saf": 76.27, "Att": 67.65},
    {"model": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", "link": "auto", "Avg": 66.34, "Nar": 70.37, "MT": 61.90, "Con": 60.56, "IF": 72.00, "Scn": 72.73, "Saf": 69.49, "Att": 60.29},
    {"model": "allenai/Llama-3.1-8B-Instruct-RM-RB2", "link": "auto", "Avg": 65.06, "Nar": 59.26, "MT": 61.94, "Con": 59.15, "IF": 70.00, "Scn": 72.73, "Saf": 71.19, "Att": 61.16},
    {"model": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", "link": "auto", "Avg": 64.89, "Nar": 66.67, "MT": 60.32, "Con": 57.75, "IF": 70.00, "Scn": 66.67, "Saf": 66.10, "Att": 64.71},
    {"model": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", "link": "auto", "Avg": 64.17, "Nar": 53.70, "MT": 63.49, "Con": 60.56, "IF": 66.00, "Scn": 71.21, "Saf": 69.49, "Att": 64.71},
    {"model": "CharacterRM", "link": "morecry/BaichuanCharRM", "Avg": 61.11, "Nar": 59.26, "MT": 65.08, "Con": 56.34, "IF": 72.00, "Scn": 66.67, "Saf": 52.54, "Att": 55.88},
    {"model": "infly/INF-ORM-Llama3.1-70B", "link": "auto", "Avg": 58.51, "Nar": 61.11, "MT": 61.90, "Con": 50.70, "IF": 58.00, "Scn": 56.06, "Saf": 64.41, "Att": 57.35},
    {"model": "Ray2333/GRM_Llama3.1_8B_rewardmodel-ft", "link": "auto", "Avg": 56.50, "Nar": 53.70, "MT": 58.73, "Con": 57.75, "IF": 56.00, "Scn": 56.06, "Saf": 59.32, "Att": 52.94},
    {"model": "Skywork/Skywork-Reward-Llama-3.1-8B", "link": "auto", "Avg": 53.50, "Nar": 48.15, "MT": 50.79, "Con": 50.70, "IF": 58.00, "Scn": 59.09, "Saf": 55.93, "Att": 50.00},
    # NOTE(review): "Scn": 50.06 breaks the x.x5/x.x0 pattern of the other Scn values — verify against the paper.
    {"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "link": "auto", "Avg": 51.97, "Nar": 42.58, "MT": 50.79, "Con": 45.07, "IF": 60.00, "Scn": 50.06, "Saf": 55.93, "Att": 57.35},
    {"model": "nicolinho/QRM-Llama3.1-8B-v2", "link": "auto", "Avg": 47.42, "Nar": 44.44, "MT": 58.73, "Con": 40.85, "IF": 46.00, "Scn": 50.00, "Saf": 43.37, "Att": 48.53},
    {"model": "NCSOFT/Llama-3-OffsetBias-RM-8B", "link": "auto", "Avg": 47.17, "Nar": 44.44, "MT": 49.21, "Con": 39.44, "IF": 32.00, "Scn": 50.00, "Saf": 69.49, "Att": 45.59},
    # Proprietary Models (no links)
    {"model": "GPT-5-mini-2025-08-07", "link": None, "Avg": 69.30, "Nar": 68.52, "MT": 73.02, "Con": 59.86, "IF": 83.00, "Scn": 68.94, "Saf": 70.34, "Att": 65.44},
    {"model": "GPT-4o-2024-08-06", "link": None, "Avg": 69.12, "Nar": 66.67, "MT": 66.67, "Con": 66.90, "IF": 71.00, "Scn": 68.18, "Saf": 78.81, "Att": 67.65},
    {"model": "GPT-5-2025-08-07", "link": None, "Avg": 67.55, "Nar": 69.44, "MT": 66.67, "Con": 66.20, "IF": 82.00, "Scn": 65.91, "Saf": 60.17, "Att": 62.50},
    {"model": "Claude-3-7-sonnet-20250219", "link": None, "Avg": 65.24, "Nar": 68.52, "MT": 62.70, "Con": 65.49, "IF": 75.00, "Scn": 62.88, "Saf": 61.02, "Att": 61.76},
    # Ours
    {"model": "RoleRM", "link": None, "Avg": 88.32, "Nar": 90.74, "MT": 82.54, "Con": 80.28, "IF": 94.00, "Scn": 90.91, "Saf": 91.53, "Att": 88.24},
]
|
| 35 |
|
| 36 |
|
|
|
|
| 39 |
records = []
|
| 40 |
for entry in LEADERBOARD_DATA:
|
| 41 |
record = {
|
| 42 |
+
"Model": make_clickable_model(entry["model"], entry["link"]),
|
| 43 |
}
|
| 44 |
for col in benchmark_cols:
|
| 45 |
record[col] = entry[col]
|