Spaces:

ping98k
/

LLM-Brainstorming

Sleeping

App Files Files Community

ping98k committed on Jun 27, 2025

Commit

3404ee0

1 Parent(s): 348dd93

Add configurable thinking budget

Browse files

Files changed (5) hide show

README.md +9 -0
main.py +22 -0
tests/test_main.py +7 -1
tests/test_tournament_utils.py +23 -1
tournament_utils.py +29 -4

README.md CHANGED Viewed

@@ -21,6 +21,15 @@ This project provides a small interface for running "tournaments" between langua
    - `PASS_INSTRUCTION_TO_PAIRWISE`
    - `ENABLE_SCORE_FILTER`
    - `ENABLE_PAIRWISE_FILTER`
 2. Install dependencies (example with `pip`):
    ```bash
    pip install gradio litellm python-dotenv tqdm matplotlib

    - `PASS_INSTRUCTION_TO_PAIRWISE`
    - `ENABLE_SCORE_FILTER`
    - `ENABLE_PAIRWISE_FILTER`
+   - `ENABLE_GENERATE_THINKING`
+   - `ENABLE_SCORE_THINKING`
+   - `ENABLE_PAIRWISE_THINKING`
+   - `THINKING_BUDGET_TOKENS`
+   Each thinking flag controls one stage (generate, score, or pairwise). When a
+   stage's flag is enabled, the app sends
+   `thinking={"type": "enabled", "budget_tokens": $THINKING_BUDGET_TOKENS}` with each
+   `litellm.completion` call for that stage's model (the budget defaults to 1024
+   tokens). Otherwise it sends `thinking={"type": "disabled", "budget_tokens": 0}`.
 2. Install dependencies (example with `pip`):
    ```bash
    pip install gradio litellm python-dotenv tqdm matplotlib

main.py CHANGED Viewed

@@ -45,6 +45,10 @@ SCORE_TEMPERATURE_DEFAULT = float(os.getenv("SCORE_TEMPERATURE", "0.6"))
 PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
 SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
 PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
 CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
 def _clean_json(txt):
     txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
@@ -72,6 +76,9 @@ def run_tournament(
     enable_pairwise_filter,
     score_with_instruction,
     pairwise_with_instruction,
 ):
     instruction = instruction_input.strip()
     criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
@@ -101,6 +108,12 @@ def run_tournament(
         score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
     if pairwise_with_instruction is None:
         pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
     process_log = []
     hist_fig = None
     top_picks_str = ""
@@ -150,6 +163,8 @@ def run_tournament(
         api_base=api_base,
         api_key=api_token,
         temperature=generate_temperature,
         return_usage=True,
     )
     add_usage(usage)
@@ -174,6 +189,8 @@ def run_tournament(
                 api_key=api_token,
                 temperature=score_temperature,
                 include_instruction=score_with_instruction,
                 return_usage=True,
             )
             add_usage(usage)
@@ -212,6 +229,8 @@ def run_tournament(
                 api_key=api_token,
                 temperature=pairwise_temperature,
                 include_instruction=pairwise_with_instruction,
                 return_usage=True,
             )
             add_usage(usage)
@@ -302,6 +321,9 @@ demo = gr.Interface(
         gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
         gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
         gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
     ],
     outputs=[
         gr.Textbox(lines=10, label="Process"),

 PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
 SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
 PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
+GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower() == "true"
+SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
+PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
+THINKING_BUDGET_TOKENS_DEFAULT = int(os.getenv("THINKING_BUDGET_TOKENS", "1024"))
 CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
 def _clean_json(txt):
     txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
     enable_pairwise_filter,
     score_with_instruction,
     pairwise_with_instruction,
+    generate_thinking,
+    score_thinking,
+    pairwise_thinking,
 ):
     instruction = instruction_input.strip()
     criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
         score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
     if pairwise_with_instruction is None:
         pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
+    if generate_thinking is None:
+        generate_thinking = GENERATE_THINKING_DEFAULT
+    if score_thinking is None:
+        score_thinking = SCORE_THINKING_DEFAULT
+    if pairwise_thinking is None:
+        pairwise_thinking = PAIRWISE_THINKING_DEFAULT
     process_log = []
     hist_fig = None
     top_picks_str = ""
         api_base=api_base,
         api_key=api_token,
         temperature=generate_temperature,
+        thinking=generate_thinking,
+        budget_tokens=THINKING_BUDGET_TOKENS_DEFAULT,
         return_usage=True,
     )
     add_usage(usage)
                 api_key=api_token,
                 temperature=score_temperature,
                 include_instruction=score_with_instruction,
+                thinking=score_thinking,
+                budget_tokens=THINKING_BUDGET_TOKENS_DEFAULT,
                 return_usage=True,
             )
             add_usage(usage)
                 api_key=api_token,
                 temperature=pairwise_temperature,
                 include_instruction=pairwise_with_instruction,
+                thinking=pairwise_thinking,
+                budget_tokens=THINKING_BUDGET_TOKENS_DEFAULT,
                 return_usage=True,
             )
             add_usage(usage)
         gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
         gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
         gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
+        gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
+        gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
+        gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
     ],
     outputs=[
         gr.Textbox(lines=10, label="Process"),

tests/test_main.py CHANGED Viewed

@@ -106,13 +106,16 @@ def test_run_tournament_full_loop():
             enable_pairwise_filter=True,
             score_with_instruction=True,
             pairwise_with_instruction=True,
         ))
     process_log, hist_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
     assert top_picks.strip() in {'p1', 'p2'}
-    mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, return_usage=True)
     assert 'Score completion' in process_log
     assert 'Pairwise completion' in process_log
     assert 'Prompt tokens' in usage
@@ -151,6 +154,9 @@ def test_run_tournament_pairwise_odd_players():
             enable_pairwise_filter=True,
             score_with_instruction=True,
             pairwise_with_instruction=True,
         ))
     process_log, fig, top_picks, usage = results[-1]

             enable_pairwise_filter=True,
             score_with_instruction=True,
             pairwise_with_instruction=True,
+            generate_thinking=True,
+            score_thinking=True,
+            pairwise_thinking=True,
         ))
     process_log, hist_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
     assert top_picks.strip() in {'p1', 'p2'}
+    mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, thinking=True, budget_tokens=1024, return_usage=True)
     assert 'Score completion' in process_log
     assert 'Pairwise completion' in process_log
     assert 'Prompt tokens' in usage
             enable_pairwise_filter=True,
             score_with_instruction=True,
             pairwise_with_instruction=True,
+            generate_thinking=True,
+            score_thinking=True,
+            pairwise_thinking=True,
         ))
     process_log, fig, top_picks, usage = results[-1]

tests/test_tournament_utils.py CHANGED Viewed

@@ -26,7 +26,7 @@ def test_generate_players():
     resp = make_response([" player1 ", "player2\n"])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
-        mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5)
         assert players == ['player1', 'player2']
@@ -50,3 +50,25 @@ def test_prompt_pairwise():
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.3
         assert result == '{"winner": "A"}'

     resp = make_response([" player1 ", "player2\n"])
     with patch('tournament_utils.completion', return_value=resp) as mock_comp:
         players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
+        mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5, thinking={'type': 'disabled', 'budget_tokens': 0})
         assert players == ['player1', 'player2']
         assert mock_comp.call_args.kwargs['api_key'] == 'k'
         assert mock_comp.call_args.kwargs['temperature'] == 0.3
         assert result == '{"winner": "A"}'
+def test_thinking_passed_to_completion():
+    resp = make_response(["ok"])
+    with patch('tournament_utils.completion', return_value=resp) as mock_comp:
+        tu.generate_players('i', 1, thinking=True)
+        tu.prompt_score('i', ['c'], 'block', 'p', thinking=True)
+        tu.prompt_pairwise('i', 'block', 'a', 'b', thinking=True)
+        assert mock_comp.call_count == 3
+        for call in mock_comp.call_args_list:
+            assert call.kwargs['thinking'] == {'type': 'enabled', 'budget_tokens': 1024}
+def test_thinking_disabled_by_default():
+    resp = make_response(["ok"])
+    with patch('tournament_utils.completion', return_value=resp) as mock_comp:
+        tu.generate_players('i', 1)
+        tu.prompt_score('i', ['c'], 'block', 'p')
+        tu.prompt_pairwise('i', 'block', 'a', 'b')
+        assert mock_comp.call_count == 3
+        for call in mock_comp.call_args_list:
+            assert call.kwargs['thinking'] == {'type': 'disabled', 'budget_tokens': 0}

tournament_utils.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from litellm import completion
 def _completion_kwargs(
     api_base: str | None,
@@ -25,6 +28,8 @@ def generate_players(
     api_base: str | None = None,
     api_key: str | None = None,
     temperature: float | None = None,
     return_usage: bool = False,
 ) -> list[str] | tuple[list[str], object]:
     """Request ``n`` completions for the instruction using the given model.
@@ -32,11 +37,17 @@ def generate_players(
     When ``return_usage`` is ``True`` the ``usage`` object from the completion
     response is also returned.
     """
     response = completion(
         model=model,
-        messages=[{"role": "user", "content": instruction}],
         n=n,
-        **_completion_kwargs(api_base, api_key, temperature),
     )
     players = [c.message.content.strip() for c in response.choices]
     if return_usage:
@@ -55,6 +66,8 @@ def prompt_score(
     api_key: str | None = None,
     temperature: float | None = None,
     include_instruction: bool = True,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
     """Return a JSON score string evaluating `player` on the criteria."""
@@ -66,10 +79,15 @@ Return JSON exactly like: {{"scores": [{example_scores}]}}."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
     prompt += f"\n\nOutput:\n{player}"
     response = completion(
         model=model,
         messages=[{"role": "system", "content": prompt}],
-        **_completion_kwargs(api_base, api_key, temperature),
     )
     text = response.choices[0].message.content.strip()
     if return_usage:
@@ -88,6 +106,8 @@ def prompt_pairwise(
     api_key: str | None = None,
     temperature: float | None = None,
     include_instruction: bool = True,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
     """Return which player wins in JSON using the given criteria."""
@@ -98,10 +118,15 @@ Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
     prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
     response = completion(
         model=model,
         messages=[{"role": "system", "content": prompt}],
-        **_completion_kwargs(api_base, api_key, temperature),
     )
     text = response.choices[0].message.content.strip()
     if return_usage:

+import os
 from litellm import completion
+BUDGET_TOKENS_DEFAULT = int(os.getenv("THINKING_BUDGET_TOKENS", "1024"))
 def _completion_kwargs(
     api_base: str | None,
     api_base: str | None = None,
     api_key: str | None = None,
     temperature: float | None = None,
+    thinking: bool = False,
+    budget_tokens: int = BUDGET_TOKENS_DEFAULT,
     return_usage: bool = False,
 ) -> list[str] | tuple[list[str], object]:
     """Request ``n`` completions for the instruction using the given model.
     When ``return_usage`` is ``True`` the ``usage`` object from the completion
     response is also returned.
     """
+    messages = [{"role": "user", "content": instruction}]
+    kwargs = _completion_kwargs(api_base, api_key, temperature)
+    kwargs["thinking"] = {
+        "type": "enabled" if thinking else "disabled",
+        "budget_tokens": budget_tokens if thinking else 0,
+    }
     response = completion(
         model=model,
+        messages=messages,
         n=n,
+        **kwargs,
     )
     players = [c.message.content.strip() for c in response.choices]
     if return_usage:
     api_key: str | None = None,
     temperature: float | None = None,
     include_instruction: bool = True,
+    thinking: bool = False,
+    budget_tokens: int = BUDGET_TOKENS_DEFAULT,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
     """Return a JSON score string evaluating `player` on the criteria."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
     prompt += f"\n\nOutput:\n{player}"
+    kwargs = _completion_kwargs(api_base, api_key, temperature)
+    kwargs["thinking"] = {
+        "type": "enabled" if thinking else "disabled",
+        "budget_tokens": budget_tokens if thinking else 0,
+    }
     response = completion(
         model=model,
         messages=[{"role": "system", "content": prompt}],
+        **kwargs,
     )
     text = response.choices[0].message.content.strip()
     if return_usage:
     api_key: str | None = None,
     temperature: float | None = None,
     include_instruction: bool = True,
+    thinking: bool = False,
+    budget_tokens: int = BUDGET_TOKENS_DEFAULT,
     return_usage: bool = False,
 ) -> str | tuple[str, object]:
     """Return which player wins in JSON using the given criteria."""
     if include_instruction:
         prompt += f"\n\nInstruction:\n{instruction}"
     prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
+    kwargs = _completion_kwargs(api_base, api_key, temperature)
+    kwargs["thinking"] = {
+        "type": "enabled" if thinking else "disabled",
+        "budget_tokens": budget_tokens if thinking else 0,
+    }
     response = completion(
         model=model,
         messages=[{"role": "system", "content": prompt}],
+        **kwargs,
     )
     text = response.choices[0].message.content.strip()
     if return_usage: