ping98k commited on
Commit
fa5956e
·
1 Parent(s): 97d13c6

Add explain option to scoring and pairwise evaluation

Browse files

Introduces 'explain' parameters to both the scoring and pairwise evaluation functions, allowing an explanation for each criterion to be included in the JSON output. Updates the Gradio interface and main tournament logic to support toggling this feature.

Files changed (2) hide show
  1. main.py +14 -3
  2. tournament_utils.py +22 -2
main.py CHANGED
@@ -78,6 +78,8 @@ def run_tournament(
78
  generate_thinking,
79
  score_thinking,
80
  pairwise_thinking,
 
 
81
  ):
82
  instruction = instruction_input.strip()
83
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
@@ -113,6 +115,11 @@ def run_tournament(
113
  score_thinking = SCORE_THINKING_DEFAULT
114
  if pairwise_thinking is None:
115
  pairwise_thinking = PAIRWISE_THINKING_DEFAULT
 
 
 
 
 
116
  process_log = []
117
  hist_fig = None
118
  top_picks_str = ""
@@ -145,8 +152,8 @@ def run_tournament(
145
 
146
  def log_completion(prefix: str, text: str, player_id: int | None = None):
147
  disp = text.replace("\n", " ")
148
- if len(disp) > 100:
149
- disp = disp[:100] + "…"
150
  if player_id is not None:
151
  prefix = f"{prefix}(ID {player_id}) "
152
  return log(f"{prefix}{disp}")
@@ -154,7 +161,7 @@ def run_tournament(
154
  process_log.append(msg)
155
  tqdm.write(msg)
156
  yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
157
- yield from log("Generating players …")
158
  all_players, usage = generate_players(
159
  instruction,
160
  n_gen,
@@ -188,6 +195,7 @@ def run_tournament(
188
  temperature=score_temperature,
189
  include_instruction=score_with_instruction,
190
  thinking=score_thinking,
 
191
  return_usage=True,
192
  )
193
  add_usage(usage)
@@ -227,6 +235,7 @@ def run_tournament(
227
  temperature=pairwise_temperature,
228
  include_instruction=pairwise_with_instruction,
229
  thinking=pairwise_thinking,
 
230
  return_usage=True,
231
  )
232
  add_usage(usage)
@@ -320,6 +329,8 @@ demo = gr.Interface(
320
  gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
321
  gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
322
  gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
 
 
323
  ],
324
  outputs=[
325
  gr.Textbox(lines=10, label="Process"),
 
78
  generate_thinking,
79
  score_thinking,
80
  pairwise_thinking,
81
+ score_explain,
82
+ pairwise_explain,
83
  ):
84
  instruction = instruction_input.strip()
85
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
 
115
  score_thinking = SCORE_THINKING_DEFAULT
116
  if pairwise_thinking is None:
117
  pairwise_thinking = PAIRWISE_THINKING_DEFAULT
118
+ if score_explain is None:
119
+ score_explain = False
120
+ if pairwise_explain is None:
121
+ pairwise_explain = False
122
+
123
  process_log = []
124
  hist_fig = None
125
  top_picks_str = ""
 
152
 
153
  def log_completion(prefix: str, text: str, player_id: int | None = None):
154
  disp = text.replace("\n", " ")
155
+ if len(disp) > 1000:
156
+ disp = disp[:1000] + "…"
157
  if player_id is not None:
158
  prefix = f"{prefix}(ID {player_id}) "
159
  return log(f"{prefix}{disp}")
 
161
  process_log.append(msg)
162
  tqdm.write(msg)
163
  yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
164
+ yield from log("Generating answers …")
165
  all_players, usage = generate_players(
166
  instruction,
167
  n_gen,
 
195
  temperature=score_temperature,
196
  include_instruction=score_with_instruction,
197
  thinking=score_thinking,
198
+ explain=score_explain,
199
  return_usage=True,
200
  )
201
  add_usage(usage)
 
235
  temperature=pairwise_temperature,
236
  include_instruction=pairwise_with_instruction,
237
  thinking=pairwise_thinking,
238
+ explain=pairwise_explain,
239
  return_usage=True,
240
  )
241
  add_usage(usage)
 
329
  gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
330
  gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
331
  gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
332
+ gr.Checkbox(value=False, label="Enable Explain (Score)"),
333
+ gr.Checkbox(value=False, label="Enable Explain (Pairwise)"),
334
  ],
335
  outputs=[
336
  gr.Textbox(lines=10, label="Process"),
tournament_utils.py CHANGED
@@ -60,6 +60,7 @@ def prompt_score(
60
  temperature: float | None = None,
61
  include_instruction: bool = True,
62
  thinking: bool = False,
 
63
  return_usage: bool = False,
64
  ) -> str | tuple[str, object]:
65
  """Return a JSON score string evaluating `player` on the criteria."""
@@ -67,9 +68,18 @@ def prompt_score(
67
  prompt = f"""Evaluate the output below on the following criteria:
68
  {criteria_block}
69
 
70
- Return JSON exactly like: {{"scores": [{example_scores}]}}."""
 
 
 
 
 
 
 
 
71
  if include_instruction:
72
  prompt += f"\n\nInstruction:\n{instruction}"
 
73
  prompt += f"\n\nOutput:\n{player}"
74
  kwargs = _completion_kwargs(api_base, api_key, temperature)
75
  kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
@@ -96,13 +106,23 @@ def prompt_pairwise(
96
  temperature: float | None = None,
97
  include_instruction: bool = True,
98
  thinking: bool = False,
 
99
  return_usage: bool = False,
100
  ) -> str | tuple[str, object]:
101
  """Return which player wins in JSON using the given criteria."""
102
  prompt = f"""Compare the two players below using:
103
  {criteria_block}
104
 
105
- Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
 
 
 
 
 
 
 
 
 
106
  if include_instruction:
107
  prompt += f"\n\nInstruction:\n{instruction}"
108
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
 
60
  temperature: float | None = None,
61
  include_instruction: bool = True,
62
  thinking: bool = False,
63
+ explain: bool = False,
64
  return_usage: bool = False,
65
  ) -> str | tuple[str, object]:
66
  """Return a JSON score string evaluating `player` on the criteria."""
 
68
  prompt = f"""Evaluate the output below on the following criteria:
69
  {criteria_block}
70
 
71
+ """
72
+ if explain:
73
+ prompt += f"""
74
+ Explain each criteria in English concisely.
75
+ One sentence per criteria.
76
+ Return JSON exactly like: {{"explain":"explanation","scores": [{example_scores}]}}.""".strip()
77
+ else:
78
+ prompt += f"""Return JSON exactly like: {{"scores": [{example_scores}]}}."""
79
+
80
  if include_instruction:
81
  prompt += f"\n\nInstruction:\n{instruction}"
82
+
83
  prompt += f"\n\nOutput:\n{player}"
84
  kwargs = _completion_kwargs(api_base, api_key, temperature)
85
  kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
 
106
  temperature: float | None = None,
107
  include_instruction: bool = True,
108
  thinking: bool = False,
109
+ explain: bool = False,
110
  return_usage: bool = False,
111
  ) -> str | tuple[str, object]:
112
  """Return which player wins in JSON using the given criteria."""
113
  prompt = f"""Compare the two players below using:
114
  {criteria_block}
115
 
116
+ """
117
+
118
+ if explain:
119
+ prompt += f"""
120
+ Explain each criteria in English concisely.
121
+ One sentence per criteria.
122
+ Return JSON exactly like: {{"explain":"explanation","winner": "A"}} or {{"explain":"explanation","winner": "B"}}.""".strip()
123
+ else:
124
+ prompt += f"""Return JSON exactly like: {{"winner": "A"}} or {{"winner": "B"}}."""
125
+
126
  if include_instruction:
127
  prompt += f"\n\nInstruction:\n{instruction}"
128
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"