DearSloth committed on
Commit
d1beb96
·
1 Parent(s): 47dba14

Update space

Browse files
Files changed (3) hide show
  1. src/about.py +10 -3
  2. src/display/formatting.py +10 -3
  3. src/populate.py +26 -25
src/about.py CHANGED
@@ -29,9 +29,16 @@ TITLE = """<h1 align="center" id="space-title">🎭 RoleRMBench Leaderboard</h1>
29
 
30
  # What does your leaderboard evaluate?
31
  INTRODUCTION_TEXT = """
32
- RoleRMBench evaluates reward models on role-playing scenarios across multiple dimensions.
33
  """
34
 
35
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
36
- CITATION_BUTTON_TEXT = r"""
37
- """
 
 
 
 
 
 
 
 
29
 
30
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
A Tencent Youtu work. For more information, please refer to: [https://github.com/Dear-Sloth/RoleRMBench](https://github.com/Dear-Sloth/RoleRMBench)
"""

# Label shown above the citation snippet in the UI.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX snippet offered to users. NOTE(review): the citation key, author,
# eprint, primaryClass, and arXiv URL are still blank placeholders — fill in
# once the arXiv submission is live.
CITATION_BUTTON_TEXT = r"""@misc{ ,
title={RoleRMBench & RoleRM: Towards Reward Modeling for Profile-Based Role Play in Dialogue Systems},
author={ },
year={2025},
eprint={ },
archivePrefix={arXiv},
primaryClass={ },
url={https://arxiv.org/abs/ },
}"""
src/display/formatting.py CHANGED
@@ -2,9 +2,16 @@ def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
4
 
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
 
 
 
 
 
 
 
8
 
9
 
10
  def styled_error(error):
 
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
4
 
5
def make_clickable_model(model_name, custom_link=None):
    """Render a model name for the leaderboard, optionally as a hyperlink.

    custom_link semantics:
      * None   -> return the bare name, no link (e.g. proprietary models);
      * "auto" -> link to the Hugging Face repo named by model_name;
      * other  -> link to that Hugging Face repo path, displayed as model_name.
    """
    # Guard clause: unlinked entries short-circuit before any URL is built.
    if custom_link is None:
        return model_name
    repo_path = model_name if custom_link == "auto" else custom_link
    return model_hyperlink(f"https://huggingface.co/{repo_path}", model_name)
15
 
16
 
17
  def styled_error(error):
src/populate.py CHANGED
@@ -4,32 +4,33 @@ from src.display.formatting import make_clickable_model
4
 
5
 
6
  # Static benchmark data
 
7
  LEADERBOARD_DATA = [
8
  # Open-source Models
9
- {"model": "internlm/internlm2-20b-reward", "Avg": 70.58, "Nar": 70.37, "MT": 68.25, "Con": 67.61, "IF": 76.00, "Scn": 72.73, "Saf": 66.10, "Att": 75.00},
10
- {"model": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", "Avg": 70.36, "Nar": 66.67, "MT": 71.43, "Con": 70.42, "IF": 70.00, "Scn": 65.15, "Saf": 76.27, "Att": 70.59},
11
- {"model": "Skywork/Skywork-Reward-V2-Qwen3-8B", "Avg": 70.07, "Nar": 64.81, "MT": 69.84, "Con": 67.61, "IF": 66.00, "Scn": 75.76, "Saf": 74.58, "Att": 77.94},
12
- {"model": "internlm/internlm2-7b-reward", "Avg": 67.72, "Nar": 64.81, "MT": 63.49, "Con": 64.79, "IF": 68.00, "Scn": 72.73, "Saf": 72.88, "Att": 66.18},
13
- {"model": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", "Avg": 67.53, "Nar": 70.37, "MT": 65.08, "Con": 60.56, "IF": 76.00, "Scn": 71.21, "Saf": 67.80, "Att": 61.76},
14
- {"model": "allenai/Llama-3.1-70B-Instruct-RM-RB2", "Avg": 66.39, "Nar": 72.22, "MT": 65.08, "Con": 56.34, "IF": 62.00, "Scn": 65.15, "Saf": 76.27, "Att": 67.65},
15
- {"model": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", "Avg": 66.34, "Nar": 70.37, "MT": 61.90, "Con": 60.56, "IF": 72.00, "Scn": 72.73, "Saf": 69.49, "Att": 60.29},
16
- {"model": "allenai/Llama-3.1-8B-Instruct-RM-RB2", "Avg": 65.06, "Nar": 59.26, "MT": 61.94, "Con": 59.15, "IF": 70.00, "Scn": 72.73, "Saf": 71.19, "Att": 61.16},
17
- {"model": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", "Avg": 64.89, "Nar": 66.67, "MT": 60.32, "Con": 57.75, "IF": 70.00, "Scn": 66.67, "Saf": 66.10, "Att": 64.71},
18
- {"model": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", "Avg": 64.17, "Nar": 53.70, "MT": 63.49, "Con": 60.56, "IF": 66.00, "Scn": 71.21, "Saf": 69.49, "Att": 64.71},
19
- {"model": "CharacterRM", "Avg": 61.11, "Nar": 59.26, "MT": 65.08, "Con": 56.34, "IF": 72.00, "Scn": 66.67, "Saf": 52.54, "Att": 55.88},
20
- {"model": "infly/INF-ORM-Llama3.1-70B", "Avg": 58.51, "Nar": 61.11, "MT": 61.90, "Con": 50.70, "IF": 58.00, "Scn": 56.06, "Saf": 64.41, "Att": 57.35},
21
- {"model": "Ray2333/GRM_Llama3.1_8B_rewardmodel-ft", "Avg": 56.50, "Nar": 53.70, "MT": 58.73, "Con": 57.75, "IF": 56.00, "Scn": 56.06, "Saf": 59.32, "Att": 52.94},
22
- {"model": "Skywork/Skywork-Reward-Llama-3.1-8B", "Avg": 53.50, "Nar": 48.15, "MT": 50.79, "Con": 50.70, "IF": 58.00, "Scn": 59.09, "Saf": 55.93, "Att": 50.00},
23
- {"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "Avg": 51.97, "Nar": 42.58, "MT": 50.79, "Con": 45.07, "IF": 60.00, "Scn": 50.06, "Saf": 55.93, "Att": 57.35},
24
- {"model": "nicolinho/QRM-Llama3.1-8B-v2", "Avg": 47.42, "Nar": 44.44, "MT": 58.73, "Con": 40.85, "IF": 46.00, "Scn": 50.00, "Saf": 43.37, "Att": 48.53},
25
- {"model": "NCSOFT/Llama-3-OffsetBias-RM-8B", "Avg": 47.17, "Nar": 44.44, "MT": 49.21, "Con": 39.44, "IF": 32.00, "Scn": 50.00, "Saf": 69.49, "Att": 45.59},
26
- # Proprietary Models
27
- {"model": "GPT-5-mini-2025-08-07", "Avg": 69.30, "Nar": 68.52, "MT": 73.02, "Con": 59.86, "IF": 83.00, "Scn": 68.94, "Saf": 70.34, "Att": 65.44},
28
- {"model": "GPT-4o-2024-08-06", "Avg": 69.12, "Nar": 66.67, "MT": 66.67, "Con": 66.90, "IF": 71.00, "Scn": 68.18, "Saf": 78.81, "Att": 67.65},
29
- {"model": "GPT-5-2025-08-07", "Avg": 67.55, "Nar": 69.44, "MT": 66.67, "Con": 66.20, "IF": 82.00, "Scn": 65.91, "Saf": 60.17, "Att": 62.50},
30
- {"model": "Claude-3-7-sonnet-20250219", "Avg": 65.24, "Nar": 68.52, "MT": 62.70, "Con": 65.49, "IF": 75.00, "Scn": 62.88, "Saf": 61.02, "Att": 61.76},
31
- # Ours
32
- {"model": "RoleRM", "Avg": 88.32, "Nar": 90.74, "MT": 82.54, "Con": 80.28, "IF": 94.00, "Scn": 90.91, "Saf": 91.53, "Att": 88.24},
33
  ]
34
 
35
 
@@ -38,7 +39,7 @@ def get_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
38
  records = []
39
  for entry in LEADERBOARD_DATA:
40
  record = {
41
- "Model": make_clickable_model(entry["model"]),
42
  }
43
  for col in benchmark_cols:
44
  record[col] = entry[col]
 
4
 
5
 
6
# Static benchmark data.
# link: "auto" = derive the HuggingFace link from the model name, None = no
# link, any other value = an explicit HuggingFace repo path.
_SCORE_KEYS = ("Avg", "Nar", "MT", "Con", "IF", "Scn", "Saf", "Att")

# One row per model: (model, link, Avg, Nar, MT, Con, IF, Scn, Saf, Att).
_RAW_ROWS = [
    # Open-source Models
    ("internlm/internlm2-20b-reward", "auto", 70.58, 70.37, 68.25, 67.61, 76.00, 72.73, 66.10, 75.00),
    ("allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", "auto", 70.36, 66.67, 71.43, 70.42, 70.00, 65.15, 76.27, 70.59),
    ("Skywork/Skywork-Reward-V2-Qwen3-8B", "auto", 70.07, 64.81, 69.84, 67.61, 66.00, 75.76, 74.58, 77.94),
    ("internlm/internlm2-7b-reward", "auto", 67.72, 64.81, 63.49, 64.79, 68.00, 72.73, 72.88, 66.18),
    ("allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", "auto", 67.53, 70.37, 65.08, 60.56, 76.00, 71.21, 67.80, 61.76),
    ("allenai/Llama-3.1-70B-Instruct-RM-RB2", "auto", 66.39, 72.22, 65.08, 56.34, 62.00, 65.15, 76.27, 67.65),
    ("allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", "auto", 66.34, 70.37, 61.90, 60.56, 72.00, 72.73, 69.49, 60.29),
    ("allenai/Llama-3.1-8B-Instruct-RM-RB2", "auto", 65.06, 59.26, 61.94, 59.15, 70.00, 72.73, 71.19, 61.16),
    ("allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", "auto", 64.89, 66.67, 60.32, 57.75, 70.00, 66.67, 66.10, 64.71),
    ("Skywork/Skywork-Reward-V2-Llama-3.1-8B", "auto", 64.17, 53.70, 63.49, 60.56, 66.00, 71.21, 69.49, 64.71),
    ("CharacterRM", "morecry/BaichuanCharRM", 61.11, 59.26, 65.08, 56.34, 72.00, 66.67, 52.54, 55.88),
    ("infly/INF-ORM-Llama3.1-70B", "auto", 58.51, 61.11, 61.90, 50.70, 58.00, 56.06, 64.41, 57.35),
    ("Ray2333/GRM_Llama3.1_8B_rewardmodel-ft", "auto", 56.50, 53.70, 58.73, 57.75, 56.00, 56.06, 59.32, 52.94),
    ("Skywork/Skywork-Reward-Llama-3.1-8B", "auto", 53.50, 48.15, 50.79, 50.70, 58.00, 59.09, 55.93, 50.00),
    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "auto", 51.97, 42.58, 50.79, 45.07, 60.00, 50.06, 55.93, 57.35),
    ("nicolinho/QRM-Llama3.1-8B-v2", "auto", 47.42, 44.44, 58.73, 40.85, 46.00, 50.00, 43.37, 48.53),
    ("NCSOFT/Llama-3-OffsetBias-RM-8B", "auto", 47.17, 44.44, 49.21, 39.44, 32.00, 50.00, 69.49, 45.59),
    # Proprietary Models (no links)
    ("GPT-5-mini-2025-08-07", None, 69.30, 68.52, 73.02, 59.86, 83.00, 68.94, 70.34, 65.44),
    ("GPT-4o-2024-08-06", None, 69.12, 66.67, 66.67, 66.90, 71.00, 68.18, 78.81, 67.65),
    ("GPT-5-2025-08-07", None, 67.55, 69.44, 66.67, 66.20, 82.00, 65.91, 60.17, 62.50),
    ("Claude-3-7-sonnet-20250219", None, 65.24, 68.52, 62.70, 65.49, 75.00, 62.88, 61.02, 61.76),
    # Ours
    ("RoleRM", None, 88.32, 90.74, 82.54, 80.28, 94.00, 90.91, 91.53, 88.24),
]

# Expand each compact row into the dict shape the rest of the app consumes.
LEADERBOARD_DATA = [
    {"model": model, "link": link, **dict(zip(_SCORE_KEYS, scores))}
    for (model, link, *scores) in _RAW_ROWS
]
35
 
36
 
 
39
  records = []
40
  for entry in LEADERBOARD_DATA:
41
  record = {
42
+ "Model": make_clickable_model(entry["model"], entry["link"]),
43
  }
44
  for col in benchmark_cols:
45
  record[col] = entry[col]