Spaces:
Sleeping
Sleeping
Merge pull request #11 from ping98k/codex/add-option-to-enable-thinking-in-llms
Browse files
- README.md +8 -0
- main.py +18 -0
- tests/test_main.py +7 -1
- tests/test_tournament_utils.py +23 -1
- tournament_utils.py +14 -4
README.md
CHANGED
|
@@ -21,6 +21,14 @@ This project provides a small interface for running "tournaments" between langua
|
|
| 21 |
- `PASS_INSTRUCTION_TO_PAIRWISE`
|
| 22 |
- `ENABLE_SCORE_FILTER`
|
| 23 |
- `ENABLE_PAIRWISE_FILTER`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
2. Install dependencies (example with `pip`):
|
| 25 |
```bash
|
| 26 |
pip install gradio litellm python-dotenv tqdm matplotlib
|
|
|
|
| 21 |
- `PASS_INSTRUCTION_TO_PAIRWISE`
|
| 22 |
- `ENABLE_SCORE_FILTER`
|
| 23 |
- `ENABLE_PAIRWISE_FILTER`
|
| 24 |
+
- `ENABLE_GENERATE_THINKING`
|
| 25 |
+
- `ENABLE_SCORE_THINKING`
|
| 26 |
+
- `ENABLE_PAIRWISE_THINKING`
|
| 27 |
+
|
| 28 |
+
When one of the thinking flags is enabled, the app sends
|
| 29 |
+
`chat_template_kwargs={"enable_thinking": True}` with each
|
| 30 |
+
`litellm.completion` call for that model. Otherwise it sends
|
| 31 |
+
`chat_template_kwargs={"enable_thinking": False}`.
|
| 32 |
2. Install dependencies (example with `pip`):
|
| 33 |
```bash
|
| 34 |
pip install gradio litellm python-dotenv tqdm matplotlib
|
main.py
CHANGED
|
@@ -45,6 +45,9 @@ SCORE_TEMPERATURE_DEFAULT = float(os.getenv("SCORE_TEMPERATURE", "0.6"))
|
|
| 45 |
PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
|
| 46 |
SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
|
| 47 |
PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
|
|
|
|
|
|
|
|
|
|
| 48 |
CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
|
| 49 |
def _clean_json(txt):
|
| 50 |
txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
|
|
@@ -72,6 +75,9 @@ def run_tournament(
|
|
| 72 |
enable_pairwise_filter,
|
| 73 |
score_with_instruction,
|
| 74 |
pairwise_with_instruction,
|
|
|
|
|
|
|
|
|
|
| 75 |
):
|
| 76 |
instruction = instruction_input.strip()
|
| 77 |
criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
|
|
@@ -101,6 +107,12 @@ def run_tournament(
|
|
| 101 |
score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
|
| 102 |
if pairwise_with_instruction is None:
|
| 103 |
pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
process_log = []
|
| 105 |
hist_fig = None
|
| 106 |
top_picks_str = ""
|
|
@@ -150,6 +162,7 @@ def run_tournament(
|
|
| 150 |
api_base=api_base,
|
| 151 |
api_key=api_token,
|
| 152 |
temperature=generate_temperature,
|
|
|
|
| 153 |
return_usage=True,
|
| 154 |
)
|
| 155 |
add_usage(usage)
|
|
@@ -174,6 +187,7 @@ def run_tournament(
|
|
| 174 |
api_key=api_token,
|
| 175 |
temperature=score_temperature,
|
| 176 |
include_instruction=score_with_instruction,
|
|
|
|
| 177 |
return_usage=True,
|
| 178 |
)
|
| 179 |
add_usage(usage)
|
|
@@ -212,6 +226,7 @@ def run_tournament(
|
|
| 212 |
api_key=api_token,
|
| 213 |
temperature=pairwise_temperature,
|
| 214 |
include_instruction=pairwise_with_instruction,
|
|
|
|
| 215 |
return_usage=True,
|
| 216 |
)
|
| 217 |
add_usage(usage)
|
|
@@ -302,6 +317,9 @@ demo = gr.Interface(
|
|
| 302 |
gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
|
| 303 |
gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
|
| 304 |
gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
|
|
|
|
|
|
|
|
|
|
| 305 |
],
|
| 306 |
outputs=[
|
| 307 |
gr.Textbox(lines=10, label="Process"),
|
|
|
|
| 45 |
PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
|
| 46 |
SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
|
| 47 |
PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
|
| 48 |
+
GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower() == "true"
|
| 49 |
+
SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
|
| 50 |
+
PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
|
| 51 |
CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
|
| 52 |
def _clean_json(txt):
|
| 53 |
txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
|
|
|
|
| 75 |
enable_pairwise_filter,
|
| 76 |
score_with_instruction,
|
| 77 |
pairwise_with_instruction,
|
| 78 |
+
generate_thinking,
|
| 79 |
+
score_thinking,
|
| 80 |
+
pairwise_thinking,
|
| 81 |
):
|
| 82 |
instruction = instruction_input.strip()
|
| 83 |
criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
|
|
|
|
| 107 |
score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
|
| 108 |
if pairwise_with_instruction is None:
|
| 109 |
pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
|
| 110 |
+
if generate_thinking is None:
|
| 111 |
+
generate_thinking = GENERATE_THINKING_DEFAULT
|
| 112 |
+
if score_thinking is None:
|
| 113 |
+
score_thinking = SCORE_THINKING_DEFAULT
|
| 114 |
+
if pairwise_thinking is None:
|
| 115 |
+
pairwise_thinking = PAIRWISE_THINKING_DEFAULT
|
| 116 |
process_log = []
|
| 117 |
hist_fig = None
|
| 118 |
top_picks_str = ""
|
|
|
|
| 162 |
api_base=api_base,
|
| 163 |
api_key=api_token,
|
| 164 |
temperature=generate_temperature,
|
| 165 |
+
thinking=generate_thinking,
|
| 166 |
return_usage=True,
|
| 167 |
)
|
| 168 |
add_usage(usage)
|
|
|
|
| 187 |
api_key=api_token,
|
| 188 |
temperature=score_temperature,
|
| 189 |
include_instruction=score_with_instruction,
|
| 190 |
+
thinking=score_thinking,
|
| 191 |
return_usage=True,
|
| 192 |
)
|
| 193 |
add_usage(usage)
|
|
|
|
| 226 |
api_key=api_token,
|
| 227 |
temperature=pairwise_temperature,
|
| 228 |
include_instruction=pairwise_with_instruction,
|
| 229 |
+
thinking=pairwise_thinking,
|
| 230 |
return_usage=True,
|
| 231 |
)
|
| 232 |
add_usage(usage)
|
|
|
|
| 317 |
gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
|
| 318 |
gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
|
| 319 |
gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
|
| 320 |
+
gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
|
| 321 |
+
gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
|
| 322 |
+
gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
|
| 323 |
],
|
| 324 |
outputs=[
|
| 325 |
gr.Textbox(lines=10, label="Process"),
|
tests/test_main.py
CHANGED
|
@@ -106,13 +106,16 @@ def test_run_tournament_full_loop():
|
|
| 106 |
enable_pairwise_filter=True,
|
| 107 |
score_with_instruction=True,
|
| 108 |
pairwise_with_instruction=True,
|
|
|
|
|
|
|
|
|
|
| 109 |
))
|
| 110 |
|
| 111 |
process_log, hist_fig, top_picks, usage = results[-1]
|
| 112 |
assert 'Done' in process_log
|
| 113 |
assert hist_fig == 'fig'
|
| 114 |
assert top_picks.strip() in {'p1', 'p2'}
|
| 115 |
-
mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, return_usage=True)
|
| 116 |
assert 'Score completion' in process_log
|
| 117 |
assert 'Pairwise completion' in process_log
|
| 118 |
assert 'Prompt tokens' in usage
|
|
@@ -151,6 +154,9 @@ def test_run_tournament_pairwise_odd_players():
|
|
| 151 |
enable_pairwise_filter=True,
|
| 152 |
score_with_instruction=True,
|
| 153 |
pairwise_with_instruction=True,
|
|
|
|
|
|
|
|
|
|
| 154 |
))
|
| 155 |
|
| 156 |
process_log, fig, top_picks, usage = results[-1]
|
|
|
|
| 106 |
enable_pairwise_filter=True,
|
| 107 |
score_with_instruction=True,
|
| 108 |
pairwise_with_instruction=True,
|
| 109 |
+
generate_thinking=True,
|
| 110 |
+
score_thinking=True,
|
| 111 |
+
pairwise_thinking=True,
|
| 112 |
))
|
| 113 |
|
| 114 |
process_log, hist_fig, top_picks, usage = results[-1]
|
| 115 |
assert 'Done' in process_log
|
| 116 |
assert hist_fig == 'fig'
|
| 117 |
assert top_picks.strip() in {'p1', 'p2'}
|
| 118 |
+
mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, thinking=True, return_usage=True)
|
| 119 |
assert 'Score completion' in process_log
|
| 120 |
assert 'Pairwise completion' in process_log
|
| 121 |
assert 'Prompt tokens' in usage
|
|
|
|
| 154 |
enable_pairwise_filter=True,
|
| 155 |
score_with_instruction=True,
|
| 156 |
pairwise_with_instruction=True,
|
| 157 |
+
generate_thinking=True,
|
| 158 |
+
score_thinking=True,
|
| 159 |
+
pairwise_thinking=True,
|
| 160 |
))
|
| 161 |
|
| 162 |
process_log, fig, top_picks, usage = results[-1]
|
tests/test_tournament_utils.py
CHANGED
|
@@ -26,7 +26,7 @@ def test_generate_players():
|
|
| 26 |
resp = make_response([" player1 ", "player2\n"])
|
| 27 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 28 |
players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
|
| 29 |
-
mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5)
|
| 30 |
assert players == ['player1', 'player2']
|
| 31 |
|
| 32 |
|
|
@@ -50,3 +50,25 @@ def test_prompt_pairwise():
|
|
| 50 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 51 |
assert mock_comp.call_args.kwargs['temperature'] == 0.3
|
| 52 |
assert result == '{"winner": "A"}'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
resp = make_response([" player1 ", "player2\n"])
|
| 27 |
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 28 |
players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
|
| 29 |
+
mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5, chat_template_kwargs={'enable_thinking': False})
|
| 30 |
assert players == ['player1', 'player2']
|
| 31 |
|
| 32 |
|
|
|
|
| 50 |
assert mock_comp.call_args.kwargs['api_key'] == 'k'
|
| 51 |
assert mock_comp.call_args.kwargs['temperature'] == 0.3
|
| 52 |
assert result == '{"winner": "A"}'
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_thinking_passed_to_completion():
|
| 56 |
+
resp = make_response(["ok"])
|
| 57 |
+
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 58 |
+
tu.generate_players('i', 1, thinking=True)
|
| 59 |
+
tu.prompt_score('i', ['c'], 'block', 'p', thinking=True)
|
| 60 |
+
tu.prompt_pairwise('i', 'block', 'a', 'b', thinking=True)
|
| 61 |
+
assert mock_comp.call_count == 3
|
| 62 |
+
for call in mock_comp.call_args_list:
|
| 63 |
+
assert call.kwargs['chat_template_kwargs'] == {'enable_thinking': True}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_thinking_disabled_by_default():
|
| 67 |
+
resp = make_response(["ok"])
|
| 68 |
+
with patch('tournament_utils.completion', return_value=resp) as mock_comp:
|
| 69 |
+
tu.generate_players('i', 1)
|
| 70 |
+
tu.prompt_score('i', ['c'], 'block', 'p')
|
| 71 |
+
tu.prompt_pairwise('i', 'block', 'a', 'b')
|
| 72 |
+
assert mock_comp.call_count == 3
|
| 73 |
+
for call in mock_comp.call_args_list:
|
| 74 |
+
assert call.kwargs['chat_template_kwargs'] == {'enable_thinking': False}
|
tournament_utils.py
CHANGED
|
@@ -25,6 +25,7 @@ def generate_players(
|
|
| 25 |
api_base: str | None = None,
|
| 26 |
api_key: str | None = None,
|
| 27 |
temperature: float | None = None,
|
|
|
|
| 28 |
return_usage: bool = False,
|
| 29 |
) -> list[str] | tuple[list[str], object]:
|
| 30 |
"""Request ``n`` completions for the instruction using the given model.
|
|
@@ -32,11 +33,14 @@ def generate_players(
|
|
| 32 |
When ``return_usage`` is ``True`` the ``usage`` object from the completion
|
| 33 |
response is also returned.
|
| 34 |
"""
|
|
|
|
|
|
|
|
|
|
| 35 |
response = completion(
|
| 36 |
model=model,
|
| 37 |
-
messages=
|
| 38 |
n=n,
|
| 39 |
-
**
|
| 40 |
)
|
| 41 |
players = [c.message.content.strip() for c in response.choices]
|
| 42 |
if return_usage:
|
|
@@ -55,6 +59,7 @@ def prompt_score(
|
|
| 55 |
api_key: str | None = None,
|
| 56 |
temperature: float | None = None,
|
| 57 |
include_instruction: bool = True,
|
|
|
|
| 58 |
return_usage: bool = False,
|
| 59 |
) -> str | tuple[str, object]:
|
| 60 |
"""Return a JSON score string evaluating `player` on the criteria."""
|
|
@@ -66,10 +71,12 @@ Return JSON exactly like: {{"scores": [{example_scores}]}}."""
|
|
| 66 |
if include_instruction:
|
| 67 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 68 |
prompt += f"\n\nOutput:\n{player}"
|
|
|
|
|
|
|
| 69 |
response = completion(
|
| 70 |
model=model,
|
| 71 |
messages=[{"role": "system", "content": prompt}],
|
| 72 |
-
**
|
| 73 |
)
|
| 74 |
text = response.choices[0].message.content.strip()
|
| 75 |
if return_usage:
|
|
@@ -88,6 +95,7 @@ def prompt_pairwise(
|
|
| 88 |
api_key: str | None = None,
|
| 89 |
temperature: float | None = None,
|
| 90 |
include_instruction: bool = True,
|
|
|
|
| 91 |
return_usage: bool = False,
|
| 92 |
) -> str | tuple[str, object]:
|
| 93 |
"""Return which player wins in JSON using the given criteria."""
|
|
@@ -98,10 +106,12 @@ Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
|
|
| 98 |
if include_instruction:
|
| 99 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 100 |
prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
|
|
|
|
|
|
|
| 101 |
response = completion(
|
| 102 |
model=model,
|
| 103 |
messages=[{"role": "system", "content": prompt}],
|
| 104 |
-
**
|
| 105 |
)
|
| 106 |
text = response.choices[0].message.content.strip()
|
| 107 |
if return_usage:
|
|
|
|
| 25 |
api_base: str | None = None,
|
| 26 |
api_key: str | None = None,
|
| 27 |
temperature: float | None = None,
|
| 28 |
+
thinking: bool = False,
|
| 29 |
return_usage: bool = False,
|
| 30 |
) -> list[str] | tuple[list[str], object]:
|
| 31 |
"""Request ``n`` completions for the instruction using the given model.
|
|
|
|
| 33 |
When ``return_usage`` is ``True`` the ``usage`` object from the completion
|
| 34 |
response is also returned.
|
| 35 |
"""
|
| 36 |
+
messages = [{"role": "user", "content": instruction}]
|
| 37 |
+
kwargs = _completion_kwargs(api_base, api_key, temperature)
|
| 38 |
+
kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
|
| 39 |
response = completion(
|
| 40 |
model=model,
|
| 41 |
+
messages=messages,
|
| 42 |
n=n,
|
| 43 |
+
**kwargs,
|
| 44 |
)
|
| 45 |
players = [c.message.content.strip() for c in response.choices]
|
| 46 |
if return_usage:
|
|
|
|
| 59 |
api_key: str | None = None,
|
| 60 |
temperature: float | None = None,
|
| 61 |
include_instruction: bool = True,
|
| 62 |
+
thinking: bool = False,
|
| 63 |
return_usage: bool = False,
|
| 64 |
) -> str | tuple[str, object]:
|
| 65 |
"""Return a JSON score string evaluating `player` on the criteria."""
|
|
|
|
| 71 |
if include_instruction:
|
| 72 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 73 |
prompt += f"\n\nOutput:\n{player}"
|
| 74 |
+
kwargs = _completion_kwargs(api_base, api_key, temperature)
|
| 75 |
+
kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
|
| 76 |
response = completion(
|
| 77 |
model=model,
|
| 78 |
messages=[{"role": "system", "content": prompt}],
|
| 79 |
+
**kwargs,
|
| 80 |
)
|
| 81 |
text = response.choices[0].message.content.strip()
|
| 82 |
if return_usage:
|
|
|
|
| 95 |
api_key: str | None = None,
|
| 96 |
temperature: float | None = None,
|
| 97 |
include_instruction: bool = True,
|
| 98 |
+
thinking: bool = False,
|
| 99 |
return_usage: bool = False,
|
| 100 |
) -> str | tuple[str, object]:
|
| 101 |
"""Return which player wins in JSON using the given criteria."""
|
|
|
|
| 106 |
if include_instruction:
|
| 107 |
prompt += f"\n\nInstruction:\n{instruction}"
|
| 108 |
prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
|
| 109 |
+
kwargs = _completion_kwargs(api_base, api_key, temperature)
|
| 110 |
+
kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
|
| 111 |
response = completion(
|
| 112 |
model=model,
|
| 113 |
messages=[{"role": "system", "content": prompt}],
|
| 114 |
+
**kwargs,
|
| 115 |
)
|
| 116 |
text = response.choices[0].message.content.strip()
|
| 117 |
if return_usage:
|