Spaces:
Sleeping
Sleeping
ping98k
committed on
Commit
·
fa5956e
1
Parent(s):
97d13c6
Add explain option to scoring and pairwise evaluation
Browse files
Introduces 'explain' parameters to both the scoring and pairwise evaluation functions, allowing an explanation for each criterion to be included in the JSON output. Updates the Gradio interface and the main tournament logic to support toggling this feature.
- main.py +14 -3
- tournament_utils.py +22 -2
main.py
CHANGED
|
@@ -78,6 +78,8 @@ def run_tournament(
|
|
| 78 |
generate_thinking,
|
| 79 |
score_thinking,
|
| 80 |
pairwise_thinking,
|
|
|
|
|
|
|
| 81 |
):
|
| 82 |
instruction = instruction_input.strip()
|
| 83 |
criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
|
|
@@ -113,6 +115,11 @@ def run_tournament(
|
|
| 113 |
score_thinking = SCORE_THINKING_DEFAULT
|
| 114 |
if pairwise_thinking is None:
|
| 115 |
pairwise_thinking = PAIRWISE_THINKING_DEFAULT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
process_log = []
|
| 117 |
hist_fig = None
|
| 118 |
top_picks_str = ""
|
|
@@ -145,8 +152,8 @@ def run_tournament(
|
|
| 145 |
|
| 146 |
def log_completion(prefix: str, text: str, player_id: int | None = None):
|
| 147 |
disp = text.replace("\n", " ")
|
| 148 |
-
if len(disp) >
|
| 149 |
-
disp = disp[:
|
| 150 |
if player_id is not None:
|
| 151 |
prefix = f"{prefix}(ID {player_id}) "
|
| 152 |
return log(f"{prefix}{disp}")
|
|
@@ -154,7 +161,7 @@ def run_tournament(
|
|
| 154 |
process_log.append(msg)
|
| 155 |
tqdm.write(msg)
|
| 156 |
yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
|
| 157 |
-
yield from log("Generating
|
| 158 |
all_players, usage = generate_players(
|
| 159 |
instruction,
|
| 160 |
n_gen,
|
|
@@ -188,6 +195,7 @@ def run_tournament(
|
|
| 188 |
temperature=score_temperature,
|
| 189 |
include_instruction=score_with_instruction,
|
| 190 |
thinking=score_thinking,
|
|
|
|
| 191 |
return_usage=True,
|
| 192 |
)
|
| 193 |
add_usage(usage)
|
|
@@ -227,6 +235,7 @@ def run_tournament(
|
|
| 227 |
temperature=pairwise_temperature,
|
| 228 |
include_instruction=pairwise_with_instruction,
|
| 229 |
thinking=pairwise_thinking,
|
|
|
|
| 230 |
return_usage=True,
|
| 231 |
)
|
| 232 |
add_usage(usage)
|
|
@@ -320,6 +329,8 @@ demo = gr.Interface(
|
|
| 320 |
gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
|
| 321 |
gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
|
| 322 |
gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
|
|
|
|
|
|
|
| 323 |
],
|
| 324 |
outputs=[
|
| 325 |
gr.Textbox(lines=10, label="Process"),
|
|
|
|
| 78 |
generate_thinking,
|
| 79 |
score_thinking,
|
| 80 |
pairwise_thinking,
|
| 81 |
+
score_explain,
|
| 82 |
+
pairwise_explain,
|
| 83 |
):
|
| 84 |
instruction = instruction_input.strip()
|
| 85 |
criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
|
|
|
|
| 115 |
score_thinking = SCORE_THINKING_DEFAULT
|
| 116 |
if pairwise_thinking is None:
|
| 117 |
pairwise_thinking = PAIRWISE_THINKING_DEFAULT
|
| 118 |
+
if score_explain is None:
|
| 119 |
+
score_explain = False
|
| 120 |
+
if pairwise_explain is None:
|
| 121 |
+
pairwise_explain = False
|
| 122 |
+
|
| 123 |
process_log = []
|
| 124 |
hist_fig = None
|
| 125 |
top_picks_str = ""
|
|
|
|
| 152 |
|
| 153 |
def log_completion(prefix: str, text: str, player_id: int | None = None):
|
| 154 |
disp = text.replace("\n", " ")
|
| 155 |
+
if len(disp) > 1000:
|
| 156 |
+
disp = disp[:1000] + "…"
|
| 157 |
if player_id is not None:
|
| 158 |
prefix = f"{prefix}(ID {player_id}) "
|
| 159 |
return log(f"{prefix}{disp}")
|
|
|
|
| 161 |
process_log.append(msg)
|
| 162 |
tqdm.write(msg)
|
| 163 |
yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
|
| 164 |
+
yield from log("Generating answers …")
|
| 165 |
all_players, usage = generate_players(
|
| 166 |
instruction,
|
| 167 |
n_gen,
|
|
|
|
| 195 |
temperature=score_temperature,
|
| 196 |
include_instruction=score_with_instruction,
|
| 197 |
thinking=score_thinking,
|
| 198 |
+
explain=score_explain,
|
| 199 |
return_usage=True,
|
| 200 |
)
|
| 201 |
add_usage(usage)
|
|
|
|
| 235 |
temperature=pairwise_temperature,
|
| 236 |
include_instruction=pairwise_with_instruction,
|
| 237 |
thinking=pairwise_thinking,
|
| 238 |
+
explain=pairwise_explain,
|
| 239 |
return_usage=True,
|
| 240 |
)
|
| 241 |
add_usage(usage)
|
|
|
|
| 329 |
gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
|
| 330 |
gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
|
| 331 |
gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
|
| 332 |
+
gr.Checkbox(value=False, label="Enable Explain (Score)"),
|
| 333 |
+
gr.Checkbox(value=False, label="Enable Explain (Pairwise)"),
|
| 334 |
],
|
| 335 |
outputs=[
|
| 336 |
gr.Textbox(lines=10, label="Process"),
|
tournament_utils.py
CHANGED
|
@@ -60,6 +60,7 @@ def prompt_score(
|
|
| 60 |
temperature: float | None = None,
|
| 61 |
include_instruction: bool = True,
|
| 62 |
thinking: bool = False,
|
|
|
|
| 63 |
return_usage: bool = False,
|
| 64 |
) -> str | tuple[str, object]:
|
| 65 |
"""Return a JSON score string evaluating `player` on the criteria."""
|
|
@@ -67,9 +68,18 @@ def prompt_score(
|
|
| 67 |
prompt = f"""Evaluate the output below on the following criteria:
|
| 68 |
{criteria_block}
|
| 69 |
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
if include_instruction:
|
| 72 |
prompt += f"\n\nInstruction:\n{instruction}"
|
|
|
|
| 73 |
prompt += f"\n\nOutput:\n{player}"
|
| 74 |
kwargs = _completion_kwargs(api_base, api_key, temperature)
|
| 75 |
kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
|
|
@@ -96,13 +106,23 @@ def prompt_pairwise(
|
|
| 96 |
temperature: float | None = None,
|
| 97 |
include_instruction: bool = True,
|
| 98 |
thinking: bool = False,
|
|
|
|
| 99 |
return_usage: bool = False,
|
| 100 |
) -> str | tuple[str, object]:
|
| 101 |
"""Return which player wins in JSON using the given criteria."""
|
| 102 |
prompt = f"""Compare the two players below using:
|
| 103 |
{criteria_block}
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
if include_instruction:
|
| 107 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 108 |
prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
|
|
|
|
| 60 |
temperature: float | None = None,
|
| 61 |
include_instruction: bool = True,
|
| 62 |
thinking: bool = False,
|
| 63 |
+
explain: bool = False,
|
| 64 |
return_usage: bool = False,
|
| 65 |
) -> str | tuple[str, object]:
|
| 66 |
"""Return a JSON score string evaluating `player` on the criteria."""
|
|
|
|
| 68 |
prompt = f"""Evaluate the output below on the following criteria:
|
| 69 |
{criteria_block}
|
| 70 |
|
| 71 |
+
"""
|
| 72 |
+
if explain:
|
| 73 |
+
prompt += f"""
|
| 74 |
+
Explain each criteria in English concisely.
|
| 75 |
+
One sentence per criteria.
|
| 76 |
+
Return JSON exactly like: {{"explain":"explanation","scores": [{example_scores}]}}.""".strip()
|
| 77 |
+
else:
|
| 78 |
+
prompt += f"""Return JSON exactly like: {{"scores": [{example_scores}]}}."""
|
| 79 |
+
|
| 80 |
if include_instruction:
|
| 81 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 82 |
+
|
| 83 |
prompt += f"\n\nOutput:\n{player}"
|
| 84 |
kwargs = _completion_kwargs(api_base, api_key, temperature)
|
| 85 |
kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
|
|
|
|
| 106 |
temperature: float | None = None,
|
| 107 |
include_instruction: bool = True,
|
| 108 |
thinking: bool = False,
|
| 109 |
+
explain: bool = False,
|
| 110 |
return_usage: bool = False,
|
| 111 |
) -> str | tuple[str, object]:
|
| 112 |
"""Return which player wins in JSON using the given criteria."""
|
| 113 |
prompt = f"""Compare the two players below using:
|
| 114 |
{criteria_block}
|
| 115 |
|
| 116 |
+
"""
|
| 117 |
+
|
| 118 |
+
if explain:
|
| 119 |
+
prompt += f"""
|
| 120 |
+
Explain each criteria in English concisely.
|
| 121 |
+
One sentence per criteria.
|
| 122 |
+
Return JSON exactly like: {{"explain":"explanation","winner": "A"}} or {{"explain":"explanation","winner": "B"}}.""".strip()
|
| 123 |
+
else:
|
| 124 |
+
prompt += f"""Return JSON exactly like: {{"winner": "A"}} or {{"winner": "B"}}."""
|
| 125 |
+
|
| 126 |
if include_instruction:
|
| 127 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 128 |
prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
|