ping98k commited on
Commit
fa5956e
·
1 Parent(s): 97d13c6

Add explain option to scoring and pairwise evaluation

Browse files

Introduces 'explain' parameters to both the scoring and pairwise evaluation functions, allowing an explanation for each criterion to be included in the JSON output. Updates the Gradio interface and main tournament logic to support toggling this feature.

Files changed (2) hide show
  1. main.py +14 -3
  2. tournament_utils.py +22 -2
main.py CHANGED
@@ -78,6 +78,8 @@ def run_tournament(
78
  generate_thinking,
79
  score_thinking,
80
  pairwise_thinking,
 
 
81
  ):
82
  instruction = instruction_input.strip()
83
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
@@ -113,6 +115,11 @@ def run_tournament(
113
  score_thinking = SCORE_THINKING_DEFAULT
114
  if pairwise_thinking is None:
115
  pairwise_thinking = PAIRWISE_THINKING_DEFAULT
 
 
 
 
 
116
  process_log = []
117
  hist_fig = None
118
  top_picks_str = ""
@@ -145,8 +152,8 @@ def run_tournament(
145
 
146
  def log_completion(prefix: str, text: str, player_id: int | None = None):
147
  disp = text.replace("\n", " ")
148
- if len(disp) > 100:
149
- disp = disp[:100] + "…"
150
  if player_id is not None:
151
  prefix = f"{prefix}(ID {player_id}) "
152
  return log(f"{prefix}{disp}")
@@ -154,7 +161,7 @@ def run_tournament(
154
  process_log.append(msg)
155
  tqdm.write(msg)
156
  yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
157
- yield from log("Generating players …")
158
  all_players, usage = generate_players(
159
  instruction,
160
  n_gen,
@@ -188,6 +195,7 @@ def run_tournament(
188
  temperature=score_temperature,
189
  include_instruction=score_with_instruction,
190
  thinking=score_thinking,
 
191
  return_usage=True,
192
  )
193
  add_usage(usage)
@@ -227,6 +235,7 @@ def run_tournament(
227
  temperature=pairwise_temperature,
228
  include_instruction=pairwise_with_instruction,
229
  thinking=pairwise_thinking,
 
230
  return_usage=True,
231
  )
232
  add_usage(usage)
@@ -320,6 +329,8 @@ demo = gr.Interface(
320
  gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
321
  gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
322
  gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
 
 
323
  ],
324
  outputs=[
325
  gr.Textbox(lines=10, label="Process"),
 
78
  generate_thinking,
79
  score_thinking,
80
  pairwise_thinking,
81
+ score_explain,
82
+ pairwise_explain,
83
  ):
84
  instruction = instruction_input.strip()
85
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
 
115
  score_thinking = SCORE_THINKING_DEFAULT
116
  if pairwise_thinking is None:
117
  pairwise_thinking = PAIRWISE_THINKING_DEFAULT
118
+ if score_explain is None:
119
+ score_explain = False
120
+ if pairwise_explain is None:
121
+ pairwise_explain = False
122
+
123
  process_log = []
124
  hist_fig = None
125
  top_picks_str = ""
 
152
 
153
  def log_completion(prefix: str, text: str, player_id: int | None = None):
154
  disp = text.replace("\n", " ")
155
+ if len(disp) > 1000:
156
+ disp = disp[:1000] + "…"
157
  if player_id is not None:
158
  prefix = f"{prefix}(ID {player_id}) "
159
  return log(f"{prefix}{disp}")
 
161
  process_log.append(msg)
162
  tqdm.write(msg)
163
  yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
164
+ yield from log("Generating answers …")
165
  all_players, usage = generate_players(
166
  instruction,
167
  n_gen,
 
195
  temperature=score_temperature,
196
  include_instruction=score_with_instruction,
197
  thinking=score_thinking,
198
+ explain=score_explain,
199
  return_usage=True,
200
  )
201
  add_usage(usage)
 
235
  temperature=pairwise_temperature,
236
  include_instruction=pairwise_with_instruction,
237
  thinking=pairwise_thinking,
238
+ explain=pairwise_explain,
239
  return_usage=True,
240
  )
241
  add_usage(usage)
 
329
  gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
330
  gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
331
  gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
332
+ gr.Checkbox(value=False, label="Enable Explain (Score)"),
333
+ gr.Checkbox(value=False, label="Enable Explain (Pairwise)"),
334
  ],
335
  outputs=[
336
  gr.Textbox(lines=10, label="Process"),
tournament_utils.py CHANGED
@@ -60,6 +60,7 @@ def prompt_score(
60
  temperature: float | None = None,
61
  include_instruction: bool = True,
62
  thinking: bool = False,
 
63
  return_usage: bool = False,
64
  ) -> str | tuple[str, object]:
65
  """Return a JSON score string evaluating `player` on the criteria."""
@@ -67,9 +68,18 @@ def prompt_score(
67
  prompt = f"""Evaluate the output below on the following criteria:
68
  {criteria_block}
69
 
70
- Return JSON exactly like: {{"scores": [{example_scores}]}}."""
 
 
 
 
 
 
 
 
71
  if include_instruction:
72
  prompt += f"\n\nInstruction:\n{instruction}"
 
73
  prompt += f"\n\nOutput:\n{player}"
74
  kwargs = _completion_kwargs(api_base, api_key, temperature)
75
  kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
@@ -96,13 +106,23 @@ def prompt_pairwise(
96
  temperature: float | None = None,
97
  include_instruction: bool = True,
98
  thinking: bool = False,
 
99
  return_usage: bool = False,
100
  ) -> str | tuple[str, object]:
101
  """Return which player wins in JSON using the given criteria."""
102
  prompt = f"""Compare the two players below using:
103
  {criteria_block}
104
 
105
- Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
 
 
 
 
 
 
 
 
 
106
  if include_instruction:
107
  prompt += f"\n\nInstruction:\n{instruction}"
108
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
 
60
  temperature: float | None = None,
61
  include_instruction: bool = True,
62
  thinking: bool = False,
63
+ explain: bool = False,
64
  return_usage: bool = False,
65
  ) -> str | tuple[str, object]:
66
  """Return a JSON score string evaluating `player` on the criteria."""
 
68
  prompt = f"""Evaluate the output below on the following criteria:
69
  {criteria_block}
70
 
71
+ """
72
+ if explain:
73
+ prompt += f"""
74
+ Explain each criteria in English concisely.
75
+ One sentence per criteria.
76
+ Return JSON exactly like: {{"explain":"explanation","scores": [{example_scores}]}}.""".strip()
77
+ else:
78
+ prompt += f"""Return JSON exactly like: {{"scores": [{example_scores}]}}."""
79
+
80
  if include_instruction:
81
  prompt += f"\n\nInstruction:\n{instruction}"
82
+
83
  prompt += f"\n\nOutput:\n{player}"
84
  kwargs = _completion_kwargs(api_base, api_key, temperature)
85
  kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
 
106
  temperature: float | None = None,
107
  include_instruction: bool = True,
108
  thinking: bool = False,
109
+ explain: bool = False,
110
  return_usage: bool = False,
111
  ) -> str | tuple[str, object]:
112
  """Return which player wins in JSON using the given criteria."""
113
  prompt = f"""Compare the two players below using:
114
  {criteria_block}
115
 
116
+ """
117
+
118
+ if explain:
119
+ prompt += f"""
120
+ Explain each criteria in English concisely.
121
+ One sentence per criteria.
122
+ Return JSON exactly like: {{"explain":"explanation","winner": "A"}} or {{"explain":"explanation","winner": "B"}}.""".strip()
123
+ else:
124
+ prompt += f"""Return JSON exactly like: {{"winner": "A"}} or {{"winner": "B"}}."""
125
+
126
  if include_instruction:
127
  prompt += f"\n\nInstruction:\n{instruction}"
128
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"