Spaces:
Sleeping
Sleeping
ping98k
committed on
Commit
·
295a884
1
Parent(s):
63a07f5
Handle explain option for plaintext judges
Browse files
- main.py +19 -6
- tests/test_main.py +12 -3
- tests/test_tournament_utils.py +4 -4
- tournament_utils.py +20 -10
main.py
CHANGED
|
@@ -49,12 +49,25 @@ GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower
|
|
| 49 |
SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
|
| 50 |
PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
|
| 51 |
CRITERIA_DEFAULT = "Factuality,Concise,Precision"
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
try:
|
| 55 |
-
|
| 56 |
-
except
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
def run_tournament(
|
| 60 |
api_base,
|
|
@@ -202,7 +215,7 @@ def run_tournament(
|
|
| 202 |
)
|
| 203 |
add_usage(usage)
|
| 204 |
score_outputs.append((idx, text))
|
| 205 |
-
data =
|
| 206 |
if "scores" in data and isinstance(data["scores"], list):
|
| 207 |
vals = data["scores"]
|
| 208 |
return sum(vals) / len(vals) if vals else 0.0
|
|
@@ -245,7 +258,7 @@ def run_tournament(
|
|
| 245 |
)
|
| 246 |
add_usage(usage)
|
| 247 |
pairwise_outputs.append(text)
|
| 248 |
-
winner_label =
|
| 249 |
winner = a if winner_label == "A" else b
|
| 250 |
match_cache[key] = winner
|
| 251 |
return winner
|
|
|
|
| 49 |
SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
|
| 50 |
PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
|
| 51 |
CRITERIA_DEFAULT = "Factuality,Concise,Precision"
|
| 52 |
+
|
| 53 |
+
# Regex used to capture the final verdict from judge output
|
| 54 |
+
FINAL_VERDICT_RE = re.compile(r"(?im)^final verdict:\s*(.*)$")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _parse_verdict(txt: str) -> dict:
|
| 58 |
+
"""Extract verdict information from judge output."""
|
| 59 |
txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
|
| 60 |
+
match = FINAL_VERDICT_RE.search(txt)
|
| 61 |
+
if not match:
|
| 62 |
+
return {}
|
| 63 |
+
verdict = match.group(1).strip()
|
| 64 |
try:
|
| 65 |
+
verdict_val = ast.literal_eval(verdict)
|
| 66 |
+
except Exception:
|
| 67 |
+
verdict_val = verdict
|
| 68 |
+
if isinstance(verdict_val, list):
|
| 69 |
+
return {"scores": verdict_val}
|
| 70 |
+
return {"winner": str(verdict_val)}
|
| 71 |
|
| 72 |
def run_tournament(
|
| 73 |
api_base,
|
|
|
|
| 215 |
)
|
| 216 |
add_usage(usage)
|
| 217 |
score_outputs.append((idx, text))
|
| 218 |
+
data = _parse_verdict(text)
|
| 219 |
if "scores" in data and isinstance(data["scores"], list):
|
| 220 |
vals = data["scores"]
|
| 221 |
return sum(vals) / len(vals) if vals else 0.0
|
|
|
|
| 258 |
)
|
| 259 |
add_usage(usage)
|
| 260 |
pairwise_outputs.append(text)
|
| 261 |
+
winner_label = _parse_verdict(text).get("winner", "A")
|
| 262 |
winner = a if winner_label == "A" else b
|
| 263 |
match_cache[key] = winner
|
| 264 |
return winner
|
tests/test_main.py
CHANGED
|
@@ -87,8 +87,14 @@ def test_run_tournament_full_loop():
|
|
| 87 |
patch('main.plt.bar'):
|
| 88 |
mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
|
| 89 |
scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
|
| 90 |
-
mock_score.side_effect = lambda instr, cl, block, player, **kw: (
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
results = list(main.run_tournament(
|
| 94 |
api_base='b',
|
|
@@ -138,7 +144,10 @@ def test_run_tournament_pairwise_odd_players():
|
|
| 138 |
patch('main.plt.hist'), \
|
| 139 |
patch('main.plt.bar'):
|
| 140 |
mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
|
| 141 |
-
mock_pair.side_effect = lambda instr, block, a, b, **kw: (
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
results = list(main.run_tournament(
|
| 144 |
api_base='b',
|
|
|
|
| 87 |
patch('main.plt.bar'):
|
| 88 |
mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
|
| 89 |
scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
|
| 90 |
+
mock_score.side_effect = lambda instr, cl, block, player, **kw: (
|
| 91 |
+
f"Final verdict: [{scores[player]}]",
|
| 92 |
+
{'prompt_tokens':1,'completion_tokens':1}
|
| 93 |
+
)
|
| 94 |
+
mock_pair.side_effect = lambda instr, block, a, b, **kw: (
|
| 95 |
+
"Final verdict: A",
|
| 96 |
+
{'prompt_tokens':1,'completion_tokens':1}
|
| 97 |
+
)
|
| 98 |
|
| 99 |
results = list(main.run_tournament(
|
| 100 |
api_base='b',
|
|
|
|
| 144 |
patch('main.plt.hist'), \
|
| 145 |
patch('main.plt.bar'):
|
| 146 |
mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
|
| 147 |
+
mock_pair.side_effect = lambda instr, block, a, b, **kw: (
|
| 148 |
+
"Final verdict: A",
|
| 149 |
+
{'prompt_tokens':1,'completion_tokens':1}
|
| 150 |
+
)
|
| 151 |
|
| 152 |
results = list(main.run_tournament(
|
| 153 |
api_base='b',
|
tests/test_tournament_utils.py
CHANGED
|
@@ -31,25 +31,25 @@ def test_generate_players():
|
|
| 31 |
|
| 32 |
|
| 33 |
def test_prompt_score():
|
| 34 |
-
resp = make_response(["
|
| 35 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 36 |
result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
|
| 37 |
mock_comp.assert_called_once()
|
| 38 |
assert mock_comp.call_args.kwargs['api_base'] == 'b'
|
| 39 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 40 |
assert mock_comp.call_args.kwargs['temperature'] == 0.2
|
| 41 |
-
assert result == '
|
| 42 |
|
| 43 |
|
| 44 |
def test_prompt_pairwise():
|
| 45 |
-
resp = make_response(["
|
| 46 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 47 |
result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
|
| 48 |
mock_comp.assert_called_once()
|
| 49 |
assert mock_comp.call_args.kwargs['api_base'] == 'b'
|
| 50 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 51 |
assert mock_comp.call_args.kwargs['temperature'] == 0.3
|
| 52 |
-
assert result == '
|
| 53 |
|
| 54 |
|
| 55 |
def test_thinking_passed_to_completion():
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def test_prompt_score():
|
| 34 |
+
resp = make_response(["Final verdict: [5]"])
|
| 35 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 36 |
result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
|
| 37 |
mock_comp.assert_called_once()
|
| 38 |
assert mock_comp.call_args.kwargs['api_base'] == 'b'
|
| 39 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 40 |
assert mock_comp.call_args.kwargs['temperature'] == 0.2
|
| 41 |
+
assert result == 'Final verdict: [5]'
|
| 42 |
|
| 43 |
|
| 44 |
def test_prompt_pairwise():
|
| 45 |
+
resp = make_response(["Final verdict: A"])
|
| 46 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 47 |
result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
|
| 48 |
mock_comp.assert_called_once()
|
| 49 |
assert mock_comp.call_args.kwargs['api_base'] == 'b'
|
| 50 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 51 |
assert mock_comp.call_args.kwargs['temperature'] == 0.3
|
| 52 |
+
assert result == 'Final verdict: A'
|
| 53 |
|
| 54 |
|
| 55 |
def test_thinking_passed_to_completion():
|
tournament_utils.py
CHANGED
|
@@ -63,18 +63,21 @@ def prompt_score(
|
|
| 63 |
explain: bool = False,
|
| 64 |
return_usage: bool = False,
|
| 65 |
) -> str | tuple[str, object]:
|
| 66 |
-
"""Return a
|
| 67 |
example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
|
| 68 |
prompt = f"""Evaluate the output below on the following criteria:
|
| 69 |
{criteria_block}
|
| 70 |
|
|
|
|
| 71 |
"""
|
|
|
|
| 72 |
if explain:
|
| 73 |
-
prompt +=
|
| 74 |
-
|
| 75 |
-
|
| 76 |
else:
|
| 77 |
-
prompt +=
|
|
|
|
| 78 |
|
| 79 |
if include_instruction:
|
| 80 |
prompt += f"\n\nInstruction:\n{instruction}"
|
|
@@ -108,18 +111,25 @@ def prompt_pairwise(
|
|
| 108 |
explain: bool = False,
|
| 109 |
return_usage: bool = False,
|
| 110 |
) -> str | tuple[str, object]:
|
| 111 |
-
"""Return which player wins in
|
| 112 |
prompt = f"""Compare the two players below using:
|
| 113 |
{criteria_block}
|
| 114 |
|
|
|
|
| 115 |
"""
|
| 116 |
|
|
|
|
| 117 |
if explain:
|
| 118 |
-
prompt +=
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
| 121 |
else:
|
| 122 |
-
prompt +=
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
if include_instruction:
|
| 125 |
prompt += f"\n\nInstruction:\n{instruction}"
|
|
|
|
| 63 |
explain: bool = False,
|
| 64 |
return_usage: bool = False,
|
| 65 |
) -> str | tuple[str, object]:
|
| 66 |
+
"""Return a plaintext score evaluation for `player`."""
|
| 67 |
example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
|
| 68 |
prompt = f"""Evaluate the output below on the following criteria:
|
| 69 |
{criteria_block}
|
| 70 |
|
| 71 |
+
{'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
|
| 72 |
"""
|
| 73 |
+
|
| 74 |
if explain:
|
| 75 |
+
prompt += "Respond in plain text with two sections exactly like:\n" \
|
| 76 |
+
"Reasons: <your reasoning>\n" \
|
| 77 |
+
f"Final verdict: [{example_scores}]"
|
| 78 |
else:
|
| 79 |
+
prompt += "Respond in plain text exactly like:\n" \
|
| 80 |
+
f"Final verdict: [{example_scores}]"
|
| 81 |
|
| 82 |
if include_instruction:
|
| 83 |
prompt += f"\n\nInstruction:\n{instruction}"
|
|
|
|
| 111 |
explain: bool = False,
|
| 112 |
return_usage: bool = False,
|
| 113 |
) -> str | tuple[str, object]:
|
| 114 |
+
"""Return which player wins in plaintext using the given criteria."""
|
| 115 |
prompt = f"""Compare the two players below using:
|
| 116 |
{criteria_block}
|
| 117 |
|
| 118 |
+
{'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
|
| 119 |
"""
|
| 120 |
|
| 121 |
+
verdict_example = "Final verdict: A or Final verdict: B"
|
| 122 |
if explain:
|
| 123 |
+
prompt += (
|
| 124 |
+
"Respond in plain text with two sections exactly like:\n"
|
| 125 |
+
"Reasons: <your reasoning>\n"
|
| 126 |
+
f"{verdict_example}"
|
| 127 |
+
)
|
| 128 |
else:
|
| 129 |
+
prompt += (
|
| 130 |
+
"Respond in plain text exactly like:\n"
|
| 131 |
+
f"{verdict_example}"
|
| 132 |
+
)
|
| 133 |
|
| 134 |
if include_instruction:
|
| 135 |
prompt += f"\n\nInstruction:\n{instruction}"
|