ping98k committed on
Commit
295a884
·
1 Parent(s): 63a07f5

Handle explain option for plaintext judges

Browse files
main.py CHANGED
@@ -49,12 +49,25 @@ GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower
49
  SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
50
  PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
51
  CRITERIA_DEFAULT = "Factuality,Concise,Precision"
52
- def _clean_json(txt):
 
 
 
 
 
 
53
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
 
 
 
 
54
  try:
55
- return json.loads(txt)
56
- except json.JSONDecodeError:
57
- return ast.literal_eval(txt)
 
 
 
58
 
59
  def run_tournament(
60
  api_base,
@@ -202,7 +215,7 @@ def run_tournament(
202
  )
203
  add_usage(usage)
204
  score_outputs.append((idx, text))
205
- data = _clean_json(text)
206
  if "scores" in data and isinstance(data["scores"], list):
207
  vals = data["scores"]
208
  return sum(vals) / len(vals) if vals else 0.0
@@ -245,7 +258,7 @@ def run_tournament(
245
  )
246
  add_usage(usage)
247
  pairwise_outputs.append(text)
248
- winner_label = _clean_json(text).get("winner", "A")
249
  winner = a if winner_label == "A" else b
250
  match_cache[key] = winner
251
  return winner
 
49
  SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
50
  PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
51
  CRITERIA_DEFAULT = "Factuality,Concise,Precision"
52
+
53
+ # Regex used to capture the final verdict from judge output
54
+ FINAL_VERDICT_RE = re.compile(r"(?im)^final verdict:\s*(.*)$")
55
+
56
+
57
+ def _parse_verdict(txt: str) -> dict:
58
+ """Extract verdict information from judge output."""
59
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
60
+ match = FINAL_VERDICT_RE.search(txt)
61
+ if not match:
62
+ return {}
63
+ verdict = match.group(1).strip()
64
  try:
65
+ verdict_val = ast.literal_eval(verdict)
66
+ except Exception:
67
+ verdict_val = verdict
68
+ if isinstance(verdict_val, list):
69
+ return {"scores": verdict_val}
70
+ return {"winner": str(verdict_val)}
71
 
72
  def run_tournament(
73
  api_base,
 
215
  )
216
  add_usage(usage)
217
  score_outputs.append((idx, text))
218
+ data = _parse_verdict(text)
219
  if "scores" in data and isinstance(data["scores"], list):
220
  vals = data["scores"]
221
  return sum(vals) / len(vals) if vals else 0.0
 
258
  )
259
  add_usage(usage)
260
  pairwise_outputs.append(text)
261
+ winner_label = _parse_verdict(text).get("winner", "A")
262
  winner = a if winner_label == "A" else b
263
  match_cache[key] = winner
264
  return winner
tests/test_main.py CHANGED
@@ -87,8 +87,14 @@ def test_run_tournament_full_loop():
87
  patch('main.plt.bar'):
88
  mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
89
  scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
90
- mock_score.side_effect = lambda instr, cl, block, player, **kw: (json.dumps({'score': scores[player]}), {'prompt_tokens':1,'completion_tokens':1})
91
- mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner': 'A'}), {'prompt_tokens':1,'completion_tokens':1})
 
 
 
 
 
 
92
 
93
  results = list(main.run_tournament(
94
  api_base='b',
@@ -138,7 +144,10 @@ def test_run_tournament_pairwise_odd_players():
138
  patch('main.plt.hist'), \
139
  patch('main.plt.bar'):
140
  mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
141
- mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner':'A'}), {'prompt_tokens':1,'completion_tokens':1})
 
 
 
142
 
143
  results = list(main.run_tournament(
144
  api_base='b',
 
87
  patch('main.plt.bar'):
88
  mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
89
  scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
90
+ mock_score.side_effect = lambda instr, cl, block, player, **kw: (
91
+ f"Final verdict: [{scores[player]}]",
92
+ {'prompt_tokens':1,'completion_tokens':1}
93
+ )
94
+ mock_pair.side_effect = lambda instr, block, a, b, **kw: (
95
+ "Final verdict: A",
96
+ {'prompt_tokens':1,'completion_tokens':1}
97
+ )
98
 
99
  results = list(main.run_tournament(
100
  api_base='b',
 
144
  patch('main.plt.hist'), \
145
  patch('main.plt.bar'):
146
  mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
147
+ mock_pair.side_effect = lambda instr, block, a, b, **kw: (
148
+ "Final verdict: A",
149
+ {'prompt_tokens':1,'completion_tokens':1}
150
+ )
151
 
152
  results = list(main.run_tournament(
153
  api_base='b',
tests/test_tournament_utils.py CHANGED
@@ -31,25 +31,25 @@ def test_generate_players():
31
 
32
 
33
  def test_prompt_score():
34
- resp = make_response([" {\"score\": [5]} "])
35
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
36
  result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
37
  mock_comp.assert_called_once()
38
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
39
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
40
  assert mock_comp.call_args.kwargs['temperature'] == 0.2
41
- assert result == '{"score": [5]}'
42
 
43
 
44
  def test_prompt_pairwise():
45
- resp = make_response([" {\"winner\": \"A\"} "])
46
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
47
  result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
48
  mock_comp.assert_called_once()
49
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
50
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
51
  assert mock_comp.call_args.kwargs['temperature'] == 0.3
52
- assert result == '{"winner": "A"}'
53
 
54
 
55
  def test_thinking_passed_to_completion():
 
31
 
32
 
33
  def test_prompt_score():
34
+ resp = make_response(["Final verdict: [5]"])
35
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
36
  result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
37
  mock_comp.assert_called_once()
38
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
39
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
40
  assert mock_comp.call_args.kwargs['temperature'] == 0.2
41
+ assert result == 'Final verdict: [5]'
42
 
43
 
44
  def test_prompt_pairwise():
45
+ resp = make_response(["Final verdict: A"])
46
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
47
  result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
48
  mock_comp.assert_called_once()
49
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
50
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
51
  assert mock_comp.call_args.kwargs['temperature'] == 0.3
52
+ assert result == 'Final verdict: A'
53
 
54
 
55
  def test_thinking_passed_to_completion():
tournament_utils.py CHANGED
@@ -63,18 +63,21 @@ def prompt_score(
63
  explain: bool = False,
64
  return_usage: bool = False,
65
  ) -> str | tuple[str, object]:
66
- """Return a JSON score string evaluating `player` on the criteria."""
67
  example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
68
  prompt = f"""Evaluate the output below on the following criteria:
69
  {criteria_block}
70
 
 
71
  """
 
72
  if explain:
73
- prompt += f"""
74
- Provide detailed reasons in English for each criterion.
75
- Return JSON exactly like: {{"reasons":"","scores": [{example_scores}]}}.""".strip()
76
  else:
77
- prompt += f"""Return JSON exactly like: {{"scores": [{example_scores}]}}."""
 
78
 
79
  if include_instruction:
80
  prompt += f"\n\nInstruction:\n{instruction}"
@@ -108,18 +111,25 @@ def prompt_pairwise(
108
  explain: bool = False,
109
  return_usage: bool = False,
110
  ) -> str | tuple[str, object]:
111
- """Return which player wins in JSON using the given criteria."""
112
  prompt = f"""Compare the two players below using:
113
  {criteria_block}
114
 
 
115
  """
116
 
 
117
  if explain:
118
- prompt += f"""
119
- Provide detailed reasons in English for each criterion.
120
- Return JSON exactly like: {{"reasons":"","winner": "A"}} or {{"reasons":"","winner": "B"}}.""".strip()
 
 
121
  else:
122
- prompt += f"""Return JSON exactly like: {{"winner": "A"}} or {{"winner": "B"}}."""
 
 
 
123
 
124
  if include_instruction:
125
  prompt += f"\n\nInstruction:\n{instruction}"
 
63
  explain: bool = False,
64
  return_usage: bool = False,
65
  ) -> str | tuple[str, object]:
66
+ """Return a plaintext score evaluation for `player`."""
67
  example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
68
  prompt = f"""Evaluate the output below on the following criteria:
69
  {criteria_block}
70
 
71
+ {'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
72
  """
73
+
74
  if explain:
75
+ prompt += "Respond in plain text with two sections exactly like:\n" \
76
+ "Reasons: <your reasoning>\n" \
77
+ f"Final verdict: [{example_scores}]"
78
  else:
79
+ prompt += "Respond in plain text exactly like:\n" \
80
+ f"Final verdict: [{example_scores}]"
81
 
82
  if include_instruction:
83
  prompt += f"\n\nInstruction:\n{instruction}"
 
111
  explain: bool = False,
112
  return_usage: bool = False,
113
  ) -> str | tuple[str, object]:
114
+ """Return which player wins in plaintext using the given criteria."""
115
  prompt = f"""Compare the two players below using:
116
  {criteria_block}
117
 
118
+ {'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
119
  """
120
 
121
+ verdict_example = "Final verdict: A or Final verdict: B"
122
  if explain:
123
+ prompt += (
124
+ "Respond in plain text with two sections exactly like:\n"
125
+ "Reasons: <your reasoning>\n"
126
+ f"{verdict_example}"
127
+ )
128
  else:
129
+ prompt += (
130
+ "Respond in plain text exactly like:\n"
131
+ f"{verdict_example}"
132
+ )
133
 
134
  if include_instruction:
135
  prompt += f"\n\nInstruction:\n{instruction}"