Spaces:

ping98k
/

LLM-Brainstorming

Sleeping

App Files Files Community

ping98k committed on Jun 28, 2025

Commit

295a884

1 Parent(s): 63a07f5

Handle explain option for plaintext judges

Browse files

Files changed (4) hide show

main.py +19 -6
tests/test_main.py +12 -3
tests/test_tournament_utils.py +4 -4
tournament_utils.py +20 -10

main.py CHANGED Viewed

@@ -49,12 +49,25 @@ GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower
 SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
 PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
 CRITERIA_DEFAULT = "Factuality,Concise,Precision"
-def _clean_json(txt):
     txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
     try:
-        return json.loads(txt)
-    except json.JSONDecodeError:
-        return ast.literal_eval(txt)
 def run_tournament(
     api_base,
@@ -202,7 +215,7 @@ def run_tournament(
             )
             add_usage(usage)
             score_outputs.append((idx, text))
-            data = _clean_json(text)
             if "scores" in data and isinstance(data["scores"], list):
                 vals = data["scores"]
                 return sum(vals) / len(vals) if vals else 0.0
@@ -245,7 +258,7 @@ def run_tournament(
             )
             add_usage(usage)
             pairwise_outputs.append(text)
-            winner_label = _clean_json(text).get("winner", "A")
             winner = a if winner_label == "A" else b
             match_cache[key] = winner
             return winner

 SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
 PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
 CRITERIA_DEFAULT = "Factuality,Concise,Precision"
+# Regex used to capture the final verdict from judge output
+FINAL_VERDICT_RE = re.compile(r"(?im)^final verdict:\s*(.*)$")
+def _parse_verdict(txt: str) -> dict:
+    """Extract verdict information from judge output."""
     txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
+    match = FINAL_VERDICT_RE.search(txt)
+    if not match:
+        return {}
+    verdict = match.group(1).strip()
     try:
+        verdict_val = ast.literal_eval(verdict)
+    except Exception:
+        verdict_val = verdict
+    if isinstance(verdict_val, list):
+        return {"scores": verdict_val}
+    return {"winner": str(verdict_val)}
 def run_tournament(
     api_base,
             )
             add_usage(usage)
             score_outputs.append((idx, text))
+            data = _parse_verdict(text)
             if "scores" in data and isinstance(data["scores"], list):
                 vals = data["scores"]
                 return sum(vals) / len(vals) if vals else 0.0
             )
             add_usage(usage)
             pairwise_outputs.append(text)
+            winner_label = _parse_verdict(text).get("winner", "A")
             winner = a if winner_label == "A" else b
             match_cache[key] = winner
             return winner

tests/test_main.py CHANGED Viewed

@@ -87,8 +87,14 @@ def test_run_tournament_full_loop():
          patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
-        mock_score.side_effect = lambda instr, cl, block, player, **kw: (json.dumps({'score': scores[player]}), {'prompt_tokens':1,'completion_tokens':1})
-        mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner': 'A'}), {'prompt_tokens':1,'completion_tokens':1})
         results = list(main.run_tournament(
             api_base='b',
@@ -138,7 +144,10 @@ def test_run_tournament_pairwise_odd_players():
          patch('main.plt.hist'), \
          patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
-        mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner':'A'}), {'prompt_tokens':1,'completion_tokens':1})
         results = list(main.run_tournament(
             api_base='b',

          patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
+        mock_score.side_effect = lambda instr, cl, block, player, **kw: (
+            f"Final verdict: [{scores[player]}]",
+            {'prompt_tokens':1,'completion_tokens':1}
+        )
+        mock_pair.side_effect = lambda instr, block, a, b, **kw: (
+            "Final verdict: A",
+            {'prompt_tokens':1,'completion_tokens':1}
+        )
         results = list(main.run_tournament(
             api_base='b',
          patch('main.plt.hist'), \
          patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
+        mock_pair.side_effect = lambda instr, block, a, b, **kw: (
+            "Final verdict: A",
+            {'prompt_tokens':1,'completion_tokens':1}
+        )
         results = list(main.run_tournament(
             api_base='b',

tests/test_tournament_utils.py CHANGED Viewed

@@ -31,25 +31,25 @@ def test_generate_players():
 def test_prompt_score():
-    resp = make_response([" {\"score\": [5]} "])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
         mock_comp.assert_called_once()
         assert mock_comp.call_args.kwargs['api_base'] == 'b'
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.2
-        assert result == '{"score": [5]}'
 def test_prompt_pairwise():
-    resp = make_response([" {\"winner\": \"A\"} "])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
         mock_comp.assert_called_once()
         assert mock_comp.call_args.kwargs['api_base'] == 'b'
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.3
-        assert result == '{"winner": "A"}'
 def test_thinking_passed_to_completion():

 def test_prompt_score():
+    resp = make_response(["Final verdict: [5]"])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
         mock_comp.assert_called_once()
         assert mock_comp.call_args.kwargs['api_base'] == 'b'
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.2
+        assert result == 'Final verdict: [5]'
 def test_prompt_pairwise():
+    resp = make_response(["Final verdict: A"])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
         mock_comp.assert_called_once()
         assert mock_comp.call_args.kwargs['api_base'] == 'b'
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.3
+        assert result == 'Final verdict: A'
 def test_thinking_passed_to_completion():

tournament_utils.py CHANGED Viewed

@@ -63,18 +63,21 @@ def prompt_score(
     explain: bool = False,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
-    """Return a JSON score string evaluating `player` on the criteria."""
     example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
     prompt = f"""Evaluate the output below on the following criteria:
 {criteria_block}
 """
     if explain:
-        prompt += f"""
-Provide detailed reasons in English for each criterion.
-Return JSON exactly like: {{"reasons":"","scores": [{example_scores}]}}.""".strip()
     else:
-        prompt += f"""Return JSON exactly like: {{"scores": [{example_scores}]}}."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
@@ -108,18 +111,25 @@ def prompt_pairwise(
     explain: bool = False,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
-    """Return which player wins in JSON using the given criteria."""
     prompt = f"""Compare the two players below using:
 {criteria_block}
 """
     if explain:
-        prompt += f"""
-Provide detailed reasons in English for each criterion.
-Return JSON exactly like: {{"reasons":"","winner": "A"}} or {{"reasons":"","winner": "B"}}.""".strip()
     else:
-        prompt += f"""Return JSON exactly like: {{"winner": "A"}} or {{"winner": "B"}}."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"

     explain: bool = False,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
+    """Return a plaintext score evaluation for `player`."""
     example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
     prompt = f"""Evaluate the output below on the following criteria:
 {criteria_block}
+{'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
 """
     if explain:
+        prompt += "Respond in plain text with two sections exactly like:\n" \
+                 "Reasons: <your reasoning>\n" \
+                 f"Final verdict: [{example_scores}]"
     else:
+        prompt += "Respond in plain text exactly like:\n" \
+                 f"Final verdict: [{example_scores}]"
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
     explain: bool = False,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
+    """Return which player wins in plaintext using the given criteria."""
     prompt = f"""Compare the two players below using:
 {criteria_block}
+{'Provide detailed reasons in English.' if explain else 'Provide a short reason.'}
 """
+    verdict_example = "Final verdict: A or Final verdict: B"
     if explain:
+        prompt += (
+            "Respond in plain text with two sections exactly like:\n"
+            "Reasons: <your reasoning>\n"
+            f"{verdict_example}"
+        )
     else:
+        prompt += (
+            "Respond in plain text exactly like:\n"
+            f"{verdict_example}"
+        )
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"