ping98k committed on
Commit
faf573e
·
unverified ·
2 Parent(s): bc6765e cca2d14

Merge pull request #7 from ping98k/codex/add-option-boxes-for-score-and-pairwise-model

Browse files
README.md CHANGED
@@ -14,6 +14,11 @@ This project provides a small interface for running "tournaments" between langua
14
  - `GENERATE_MODEL`
15
  - `SCORE_MODEL`
16
  - `PAIRWISE_MODEL`
 
 
 
 
 
17
  - `ENABLE_SCORE_FILTER`
18
  - `ENABLE_PAIRWISE_FILTER`
19
  2. Install dependencies (example with `pip`):
 
14
  - `GENERATE_MODEL`
15
  - `SCORE_MODEL`
16
  - `PAIRWISE_MODEL`
17
+ - `GENERATE_TEMPERATURE`
18
+ - `SCORE_TEMPERATURE`
19
+ - `PAIRWISE_TEMPERATURE`
20
+ - `PASS_INSTRUCTION_TO_SCORE`
21
+ - `PASS_INSTRUCTION_TO_PAIRWISE`
22
  - `ENABLE_SCORE_FILTER`
23
  - `ENABLE_PAIRWISE_FILTER`
24
  2. Install dependencies (example with `pip`):
main.py CHANGED
@@ -40,6 +40,11 @@ PAIRWISE_FILTER_DEFAULT = os.getenv("ENABLE_PAIRWISE_FILTER", "true").lower() ==
40
  GENERATE_MODEL_DEFAULT = os.getenv("GENERATE_MODEL", "gpt-4o-mini")
41
  SCORE_MODEL_DEFAULT = os.getenv("SCORE_MODEL", "gpt-4o-mini")
42
  PAIRWISE_MODEL_DEFAULT = os.getenv("PAIRWISE_MODEL", "gpt-4o-mini")
 
 
 
 
 
43
  CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
44
  def _clean_json(txt):
45
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
@@ -54,6 +59,9 @@ def run_tournament(
54
  generate_model,
55
  score_model,
56
  pairwise_model,
 
 
 
57
  instruction_input,
58
  criteria_input,
59
  n_gen,
@@ -62,6 +70,8 @@ def run_tournament(
62
  max_workers,
63
  enable_score_filter,
64
  enable_pairwise_filter,
 
 
65
  ):
66
  instruction = instruction_input.strip()
67
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
@@ -69,6 +79,12 @@ def run_tournament(
69
  num_top_picks = int(num_top_picks)
70
  pool_size = int(pool_size)
71
  max_workers = int(max_workers)
 
 
 
 
 
 
72
  if not api_base:
73
  api_base = API_BASE_DEFAULT
74
  if not api_token:
@@ -81,6 +97,10 @@ def run_tournament(
81
  pairwise_model = PAIRWISE_MODEL_DEFAULT
82
  enable_score_filter = bool(enable_score_filter)
83
  enable_pairwise_filter = bool(enable_pairwise_filter)
 
 
 
 
84
  process_log = []
85
  hist_fig = None
86
  top_picks_str = ""
@@ -127,6 +147,7 @@ def run_tournament(
127
  model=generate_model,
128
  api_base=api_base,
129
  api_key=api_token,
 
130
  return_usage=True,
131
  )
132
  add_usage(usage)
@@ -146,6 +167,8 @@ def run_tournament(
146
  model=score_model,
147
  api_base=api_base,
148
  api_key=api_token,
 
 
149
  return_usage=True,
150
  )
151
  add_usage(usage)
@@ -182,6 +205,8 @@ def run_tournament(
182
  model=pairwise_model,
183
  api_base=api_base,
184
  api_key=api_token,
 
 
185
  return_usage=True,
186
  )
187
  add_usage(usage)
@@ -259,6 +284,9 @@ demo = gr.Interface(
259
  gr.Textbox(value=GENERATE_MODEL_DEFAULT, label="Generation Model"),
260
  gr.Textbox(value=SCORE_MODEL_DEFAULT, label="Score Model"),
261
  gr.Textbox(value=PAIRWISE_MODEL_DEFAULT, label="Pairwise Model"),
 
 
 
262
  gr.Textbox(lines=10, label="Instruction"),
263
  gr.Textbox(value=CRITERIA_DEFAULT, lines=5, label="Criteria (comma separated)"),
264
  gr.Number(value=NUM_GENERATIONS_DEFAULT, label="Number of Generations"),
@@ -267,6 +295,8 @@ demo = gr.Interface(
267
  gr.Number(value=MAX_WORKERS_DEFAULT, label="Max Workers"),
268
  gr.Checkbox(value=SCORE_FILTER_DEFAULT, label="Enable Score Filter"),
269
  gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
 
 
270
  ],
271
  outputs=[
272
  gr.Textbox(lines=10, label="Process"),
 
40
  GENERATE_MODEL_DEFAULT = os.getenv("GENERATE_MODEL", "gpt-4o-mini")
41
  SCORE_MODEL_DEFAULT = os.getenv("SCORE_MODEL", "gpt-4o-mini")
42
  PAIRWISE_MODEL_DEFAULT = os.getenv("PAIRWISE_MODEL", "gpt-4o-mini")
43
+ GENERATE_TEMPERATURE_DEFAULT = float(os.getenv("GENERATE_TEMPERATURE", "1.0"))
44
+ SCORE_TEMPERATURE_DEFAULT = float(os.getenv("SCORE_TEMPERATURE", "1.0"))
45
+ PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "1.0"))
46
+ SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
47
+ PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
48
  CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
49
  def _clean_json(txt):
50
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
 
59
  generate_model,
60
  score_model,
61
  pairwise_model,
62
+ generate_temperature,
63
+ score_temperature,
64
+ pairwise_temperature,
65
  instruction_input,
66
  criteria_input,
67
  n_gen,
 
70
  max_workers,
71
  enable_score_filter,
72
  enable_pairwise_filter,
73
+ score_with_instruction,
74
+ pairwise_with_instruction,
75
  ):
76
  instruction = instruction_input.strip()
77
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
 
79
  num_top_picks = int(num_top_picks)
80
  pool_size = int(pool_size)
81
  max_workers = int(max_workers)
82
+ if generate_temperature is None:
83
+ generate_temperature = GENERATE_TEMPERATURE_DEFAULT
84
+ if score_temperature is None:
85
+ score_temperature = SCORE_TEMPERATURE_DEFAULT
86
+ if pairwise_temperature is None:
87
+ pairwise_temperature = PAIRWISE_TEMPERATURE_DEFAULT
88
  if not api_base:
89
  api_base = API_BASE_DEFAULT
90
  if not api_token:
 
97
  pairwise_model = PAIRWISE_MODEL_DEFAULT
98
  enable_score_filter = bool(enable_score_filter)
99
  enable_pairwise_filter = bool(enable_pairwise_filter)
100
+ if score_with_instruction is None:
101
+ score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
102
+ if pairwise_with_instruction is None:
103
+ pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
104
  process_log = []
105
  hist_fig = None
106
  top_picks_str = ""
 
147
  model=generate_model,
148
  api_base=api_base,
149
  api_key=api_token,
150
+ temperature=generate_temperature,
151
  return_usage=True,
152
  )
153
  add_usage(usage)
 
167
  model=score_model,
168
  api_base=api_base,
169
  api_key=api_token,
170
+ temperature=score_temperature,
171
+ include_instruction=score_with_instruction,
172
  return_usage=True,
173
  )
174
  add_usage(usage)
 
205
  model=pairwise_model,
206
  api_base=api_base,
207
  api_key=api_token,
208
+ temperature=pairwise_temperature,
209
+ include_instruction=pairwise_with_instruction,
210
  return_usage=True,
211
  )
212
  add_usage(usage)
 
284
  gr.Textbox(value=GENERATE_MODEL_DEFAULT, label="Generation Model"),
285
  gr.Textbox(value=SCORE_MODEL_DEFAULT, label="Score Model"),
286
  gr.Textbox(value=PAIRWISE_MODEL_DEFAULT, label="Pairwise Model"),
287
+ gr.Number(value=GENERATE_TEMPERATURE_DEFAULT, label="Generation Temperature"),
288
+ gr.Number(value=SCORE_TEMPERATURE_DEFAULT, label="Score Temperature"),
289
+ gr.Number(value=PAIRWISE_TEMPERATURE_DEFAULT, label="Pairwise Temperature"),
290
  gr.Textbox(lines=10, label="Instruction"),
291
  gr.Textbox(value=CRITERIA_DEFAULT, lines=5, label="Criteria (comma separated)"),
292
  gr.Number(value=NUM_GENERATIONS_DEFAULT, label="Number of Generations"),
 
295
  gr.Number(value=MAX_WORKERS_DEFAULT, label="Max Workers"),
296
  gr.Checkbox(value=SCORE_FILTER_DEFAULT, label="Enable Score Filter"),
297
  gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
298
+ gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
299
+ gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
300
  ],
301
  outputs=[
302
  gr.Textbox(lines=10, label="Process"),
tests/test_main.py CHANGED
@@ -93,6 +93,9 @@ def test_run_tournament_full_loop():
93
  generate_model='gm',
94
  score_model='sm',
95
  pairwise_model='pm',
 
 
 
96
  instruction_input='instr',
97
  criteria_input='c1,c2',
98
  n_gen=4,
@@ -101,13 +104,15 @@ def test_run_tournament_full_loop():
101
  max_workers=1,
102
  enable_score_filter=True,
103
  enable_pairwise_filter=True,
 
 
104
  ))
105
 
106
  process_log, hist_fig, top_picks, usage = results[-1]
107
  assert 'Done' in process_log
108
  assert hist_fig == 'fig'
109
  assert top_picks.strip() in {'p1', 'p2'}
110
- mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', return_usage=True)
111
  assert 'Score completion' in process_log
112
  assert 'Pairwise completion' in process_log
113
  assert 'Prompt tokens' in usage
@@ -133,6 +138,9 @@ def test_run_tournament_pairwise_odd_players():
133
  generate_model='gm',
134
  score_model='sm',
135
  pairwise_model='pm',
 
 
 
136
  instruction_input='instr',
137
  criteria_input='c1,c2',
138
  n_gen=3,
@@ -141,6 +149,8 @@ def test_run_tournament_pairwise_odd_players():
141
  max_workers=1,
142
  enable_score_filter=False,
143
  enable_pairwise_filter=True,
 
 
144
  ))
145
 
146
  process_log, fig, top_picks, usage = results[-1]
 
93
  generate_model='gm',
94
  score_model='sm',
95
  pairwise_model='pm',
96
+ generate_temperature=1,
97
+ score_temperature=1,
98
+ pairwise_temperature=1,
99
  instruction_input='instr',
100
  criteria_input='c1,c2',
101
  n_gen=4,
 
104
  max_workers=1,
105
  enable_score_filter=True,
106
  enable_pairwise_filter=True,
107
+ score_with_instruction=True,
108
+ pairwise_with_instruction=True,
109
  ))
110
 
111
  process_log, hist_fig, top_picks, usage = results[-1]
112
  assert 'Done' in process_log
113
  assert hist_fig == 'fig'
114
  assert top_picks.strip() in {'p1', 'p2'}
115
+ mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, return_usage=True)
116
  assert 'Score completion' in process_log
117
  assert 'Pairwise completion' in process_log
118
  assert 'Prompt tokens' in usage
 
138
  generate_model='gm',
139
  score_model='sm',
140
  pairwise_model='pm',
141
+ generate_temperature=1,
142
+ score_temperature=1,
143
+ pairwise_temperature=1,
144
  instruction_input='instr',
145
  criteria_input='c1,c2',
146
  n_gen=3,
 
149
  max_workers=1,
150
  enable_score_filter=False,
151
  enable_pairwise_filter=True,
152
+ score_with_instruction=True,
153
+ pairwise_with_instruction=True,
154
  ))
155
 
156
  process_log, fig, top_picks, usage = results[-1]
tests/test_tournament_utils.py CHANGED
@@ -25,26 +25,28 @@ def make_response(contents):
25
  def test_generate_players():
26
  resp = make_response([" player1 ", "player2\n"])
27
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
28
- players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k')
29
- mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k')
30
  assert players == ['player1', 'player2']
31
 
32
 
33
  def test_prompt_score():
34
  resp = make_response([" {\"score\": [5]} "])
35
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
36
- result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k')
37
  mock_comp.assert_called_once()
38
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
39
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
 
40
  assert result == '{"score": [5]}'
41
 
42
 
43
  def test_prompt_pairwise():
44
  resp = make_response([" {\"winner\": \"A\"} "])
45
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
46
- result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k')
47
  mock_comp.assert_called_once()
48
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
49
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
 
50
  assert result == '{"winner": "A"}'
 
25
  def test_generate_players():
26
  resp = make_response([" player1 ", "player2\n"])
27
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
28
+ players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
29
+ mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5)
30
  assert players == ['player1', 'player2']
31
 
32
 
33
  def test_prompt_score():
34
  resp = make_response([" {\"score\": [5]} "])
35
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
36
+ result = tu.prompt_score('instr', ['c1'], 'block', 'pl', model='m', api_base='b', api_key='k', temperature=0.2, include_instruction=False)
37
  mock_comp.assert_called_once()
38
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
39
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
40
+ assert mock_comp.call_args.kwargs['temperature'] == 0.2
41
  assert result == '{"score": [5]}'
42
 
43
 
44
  def test_prompt_pairwise():
45
  resp = make_response([" {\"winner\": \"A\"} "])
46
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
47
+ result = tu.prompt_pairwise('instr', 'block', 'A text', 'B text', model='m', api_base='b', api_key='k', temperature=0.3, include_instruction=False)
48
  mock_comp.assert_called_once()
49
  assert mock_comp.call_args.kwargs['api_base'] == 'b'
50
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
51
+ assert mock_comp.call_args.kwargs['temperature'] == 0.3
52
  assert result == '{"winner": "A"}'
tournament_utils.py CHANGED
@@ -1,13 +1,19 @@
1
  from litellm import completion
2
 
3
 
4
- def _completion_kwargs(api_base: str | None, api_key: str | None) -> dict:
 
 
 
 
5
  """Build kwargs for litellm.completion from api settings."""
6
  kwargs: dict = {}
7
  if api_base:
8
  kwargs["api_base"] = api_base
9
  if api_key:
10
  kwargs["api_key"] = api_key
 
 
11
  return kwargs
12
 
13
 
@@ -18,6 +24,7 @@ def generate_players(
18
  *,
19
  api_base: str | None = None,
20
  api_key: str | None = None,
 
21
  return_usage: bool = False,
22
  ) -> list[str] | tuple[list[str], object]:
23
  """Request ``n`` completions for the instruction using the given model.
@@ -29,7 +36,7 @@ def generate_players(
29
  model=model,
30
  messages=[{"role": "user", "content": instruction}],
31
  n=n,
32
- **_completion_kwargs(api_base, api_key),
33
  )
34
  players = [c.message.content.strip() for c in response.choices]
35
  if return_usage:
@@ -46,6 +53,8 @@ def prompt_score(
46
  *,
47
  api_base: str | None = None,
48
  api_key: str | None = None,
 
 
49
  return_usage: bool = False,
50
  ) -> str | tuple[str, object]:
51
  """Return a JSON score string evaluating `player` on the criteria."""
@@ -53,17 +62,14 @@ def prompt_score(
53
  prompt = f"""Evaluate the output below on the following criteria:
54
  {criteria_block}
55
 
56
- Return JSON exactly like: {{"scores": [{example_scores}]}}.
57
-
58
- Instruction:
59
- {instruction}
60
-
61
- Output:
62
- {player}"""
63
  response = completion(
64
  model=model,
65
  messages=[{"role": "system", "content": prompt}],
66
- **_completion_kwargs(api_base, api_key),
67
  )
68
  text = response.choices[0].message.content.strip()
69
  if return_usage:
@@ -80,24 +86,22 @@ def prompt_pairwise(
80
  *,
81
  api_base: str | None = None,
82
  api_key: str | None = None,
 
 
83
  return_usage: bool = False,
84
  ) -> str | tuple[str, object]:
85
  """Return which player wins in JSON using the given criteria."""
86
  prompt = f"""Compare the two players below using:
87
  {criteria_block}
88
 
89
- Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}.
90
-
91
- Instruction:
92
- {instruction}
93
-
94
- Players:
95
- <A>{a}</A>
96
- <B>{b}</B>"""
97
  response = completion(
98
  model=model,
99
  messages=[{"role": "system", "content": prompt}],
100
- **_completion_kwargs(api_base, api_key),
101
  )
102
  text = response.choices[0].message.content.strip()
103
  if return_usage:
 
1
  from litellm import completion
2
 
3
 
4
+ def _completion_kwargs(
5
+ api_base: str | None,
6
+ api_key: str | None,
7
+ temperature: float | None,
8
+ ) -> dict:
9
  """Build kwargs for litellm.completion from api settings."""
10
  kwargs: dict = {}
11
  if api_base:
12
  kwargs["api_base"] = api_base
13
  if api_key:
14
  kwargs["api_key"] = api_key
15
+ if temperature is not None:
16
+ kwargs["temperature"] = temperature
17
  return kwargs
18
 
19
 
 
24
  *,
25
  api_base: str | None = None,
26
  api_key: str | None = None,
27
+ temperature: float | None = None,
28
  return_usage: bool = False,
29
  ) -> list[str] | tuple[list[str], object]:
30
  """Request ``n`` completions for the instruction using the given model.
 
36
  model=model,
37
  messages=[{"role": "user", "content": instruction}],
38
  n=n,
39
+ **_completion_kwargs(api_base, api_key, temperature),
40
  )
41
  players = [c.message.content.strip() for c in response.choices]
42
  if return_usage:
 
53
  *,
54
  api_base: str | None = None,
55
  api_key: str | None = None,
56
+ temperature: float | None = None,
57
+ include_instruction: bool = True,
58
  return_usage: bool = False,
59
  ) -> str | tuple[str, object]:
60
  """Return a JSON score string evaluating `player` on the criteria."""
 
62
  prompt = f"""Evaluate the output below on the following criteria:
63
  {criteria_block}
64
 
65
+ Return JSON exactly like: {{"scores": [{example_scores}]}}."""
66
+ if include_instruction:
67
+ prompt += f"\n\nInstruction:\n{instruction}"
68
+ prompt += f"\n\nOutput:\n{player}"
 
 
 
69
  response = completion(
70
  model=model,
71
  messages=[{"role": "system", "content": prompt}],
72
+ **_completion_kwargs(api_base, api_key, temperature),
73
  )
74
  text = response.choices[0].message.content.strip()
75
  if return_usage:
 
86
  *,
87
  api_base: str | None = None,
88
  api_key: str | None = None,
89
+ temperature: float | None = None,
90
+ include_instruction: bool = True,
91
  return_usage: bool = False,
92
  ) -> str | tuple[str, object]:
93
  """Return which player wins in JSON using the given criteria."""
94
  prompt = f"""Compare the two players below using:
95
  {criteria_block}
96
 
97
+ Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
98
+ if include_instruction:
99
+ prompt += f"\n\nInstruction:\n{instruction}"
100
+ prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
 
 
 
 
101
  response = completion(
102
  model=model,
103
  messages=[{"role": "system", "content": prompt}],
104
+ **_completion_kwargs(api_base, api_key, temperature),
105
  )
106
  text = response.choices[0].message.content.strip()
107
  if return_usage: