ping98k committed on
Commit
97d13c6
·
unverified ·
2 Parent(s): 348dd93 6d50a75

Merge pull request #11 from ping98k/codex/add-option-to-enable-thinking-in-llms

Browse files
README.md CHANGED
@@ -21,6 +21,14 @@ This project provides a small interface for running "tournaments" between langua
21
  - `PASS_INSTRUCTION_TO_PAIRWISE`
22
  - `ENABLE_SCORE_FILTER`
23
  - `ENABLE_PAIRWISE_FILTER`
 
 
 
 
 
 
 
 
24
  2. Install dependencies (example with `pip`):
25
  ```bash
26
  pip install gradio litellm python-dotenv tqdm matplotlib
 
21
  - `PASS_INSTRUCTION_TO_PAIRWISE`
22
  - `ENABLE_SCORE_FILTER`
23
  - `ENABLE_PAIRWISE_FILTER`
24
+ - `ENABLE_GENERATE_THINKING`
25
+ - `ENABLE_SCORE_THINKING`
26
+ - `ENABLE_PAIRWISE_THINKING`
27
+
28
+ When any of the thinking flags are enabled, the app sends
29
+ `chat_template_kwargs={"enable_thinking": True}` with each
30
+ `litellm.completion` call for that model. Otherwise it sends
31
+ `chat_template_kwargs={"enable_thinking": False}`.
32
  2. Install dependencies (example with `pip`):
33
  ```bash
34
  pip install gradio litellm python-dotenv tqdm matplotlib
main.py CHANGED
@@ -45,6 +45,9 @@ SCORE_TEMPERATURE_DEFAULT = float(os.getenv("SCORE_TEMPERATURE", "0.6"))
45
  PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
46
  SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
47
  PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
 
 
 
48
  CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
49
  def _clean_json(txt):
50
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
@@ -72,6 +75,9 @@ def run_tournament(
72
  enable_pairwise_filter,
73
  score_with_instruction,
74
  pairwise_with_instruction,
 
 
 
75
  ):
76
  instruction = instruction_input.strip()
77
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
@@ -101,6 +107,12 @@ def run_tournament(
101
  score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
102
  if pairwise_with_instruction is None:
103
  pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
 
 
 
 
 
 
104
  process_log = []
105
  hist_fig = None
106
  top_picks_str = ""
@@ -150,6 +162,7 @@ def run_tournament(
150
  api_base=api_base,
151
  api_key=api_token,
152
  temperature=generate_temperature,
 
153
  return_usage=True,
154
  )
155
  add_usage(usage)
@@ -174,6 +187,7 @@ def run_tournament(
174
  api_key=api_token,
175
  temperature=score_temperature,
176
  include_instruction=score_with_instruction,
 
177
  return_usage=True,
178
  )
179
  add_usage(usage)
@@ -212,6 +226,7 @@ def run_tournament(
212
  api_key=api_token,
213
  temperature=pairwise_temperature,
214
  include_instruction=pairwise_with_instruction,
 
215
  return_usage=True,
216
  )
217
  add_usage(usage)
@@ -302,6 +317,9 @@ demo = gr.Interface(
302
  gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
303
  gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
304
  gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
 
 
 
305
  ],
306
  outputs=[
307
  gr.Textbox(lines=10, label="Process"),
 
45
  PAIRWISE_TEMPERATURE_DEFAULT = float(os.getenv("PAIRWISE_TEMPERATURE", "0.6"))
46
  SCORE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_SCORE", "true").lower() == "true"
47
  PAIRWISE_WITH_INSTRUCTION_DEFAULT = os.getenv("PASS_INSTRUCTION_TO_PAIRWISE", "true").lower() == "true"
48
+ GENERATE_THINKING_DEFAULT = os.getenv("ENABLE_GENERATE_THINKING", "false").lower() == "true"
49
+ SCORE_THINKING_DEFAULT = os.getenv("ENABLE_SCORE_THINKING", "false").lower() == "true"
50
+ PAIRWISE_THINKING_DEFAULT = os.getenv("ENABLE_PAIRWISE_THINKING", "false").lower() == "true"
51
  CRITERIA_DEFAULT = "Factuality,Instruction Following,Precision"
52
  def _clean_json(txt):
53
  txt = re.sub(r"^```.*?\n|```$", "", txt, flags=re.DOTALL).strip()
 
75
  enable_pairwise_filter,
76
  score_with_instruction,
77
  pairwise_with_instruction,
78
+ generate_thinking,
79
+ score_thinking,
80
+ pairwise_thinking,
81
  ):
82
  instruction = instruction_input.strip()
83
  criteria_list = [c.strip() for c in criteria_input.split(",") if c.strip()] or ["Factuality", "Instruction Following", "Precision"]
 
107
  score_with_instruction = SCORE_WITH_INSTRUCTION_DEFAULT
108
  if pairwise_with_instruction is None:
109
  pairwise_with_instruction = PAIRWISE_WITH_INSTRUCTION_DEFAULT
110
+ if generate_thinking is None:
111
+ generate_thinking = GENERATE_THINKING_DEFAULT
112
+ if score_thinking is None:
113
+ score_thinking = SCORE_THINKING_DEFAULT
114
+ if pairwise_thinking is None:
115
+ pairwise_thinking = PAIRWISE_THINKING_DEFAULT
116
  process_log = []
117
  hist_fig = None
118
  top_picks_str = ""
 
162
  api_base=api_base,
163
  api_key=api_token,
164
  temperature=generate_temperature,
165
+ thinking=generate_thinking,
166
  return_usage=True,
167
  )
168
  add_usage(usage)
 
187
  api_key=api_token,
188
  temperature=score_temperature,
189
  include_instruction=score_with_instruction,
190
+ thinking=score_thinking,
191
  return_usage=True,
192
  )
193
  add_usage(usage)
 
226
  api_key=api_token,
227
  temperature=pairwise_temperature,
228
  include_instruction=pairwise_with_instruction,
229
+ thinking=pairwise_thinking,
230
  return_usage=True,
231
  )
232
  add_usage(usage)
 
317
  gr.Checkbox(value=PAIRWISE_FILTER_DEFAULT, label="Enable Pairwise Filter"),
318
  gr.Checkbox(value=SCORE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Score Model"),
319
  gr.Checkbox(value=PAIRWISE_WITH_INSTRUCTION_DEFAULT, label="Pass Instruction to Pairwise Model"),
320
+ gr.Checkbox(value=GENERATE_THINKING_DEFAULT, label="Enable Thinking (Generate)"),
321
+ gr.Checkbox(value=SCORE_THINKING_DEFAULT, label="Enable Thinking (Score)"),
322
+ gr.Checkbox(value=PAIRWISE_THINKING_DEFAULT, label="Enable Thinking (Pairwise)"),
323
  ],
324
  outputs=[
325
  gr.Textbox(lines=10, label="Process"),
tests/test_main.py CHANGED
@@ -106,13 +106,16 @@ def test_run_tournament_full_loop():
106
  enable_pairwise_filter=True,
107
  score_with_instruction=True,
108
  pairwise_with_instruction=True,
 
 
 
109
  ))
110
 
111
  process_log, hist_fig, top_picks, usage = results[-1]
112
  assert 'Done' in process_log
113
  assert hist_fig == 'fig'
114
  assert top_picks.strip() in {'p1', 'p2'}
115
- mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, return_usage=True)
116
  assert 'Score completion' in process_log
117
  assert 'Pairwise completion' in process_log
118
  assert 'Prompt tokens' in usage
@@ -151,6 +154,9 @@ def test_run_tournament_pairwise_odd_players():
151
  enable_pairwise_filter=True,
152
  score_with_instruction=True,
153
  pairwise_with_instruction=True,
 
 
 
154
  ))
155
 
156
  process_log, fig, top_picks, usage = results[-1]
 
106
  enable_pairwise_filter=True,
107
  score_with_instruction=True,
108
  pairwise_with_instruction=True,
109
+ generate_thinking=True,
110
+ score_thinking=True,
111
+ pairwise_thinking=True,
112
  ))
113
 
114
  process_log, hist_fig, top_picks, usage = results[-1]
115
  assert 'Done' in process_log
116
  assert hist_fig == 'fig'
117
  assert top_picks.strip() in {'p1', 'p2'}
118
+ mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, thinking=True, return_usage=True)
119
  assert 'Score completion' in process_log
120
  assert 'Pairwise completion' in process_log
121
  assert 'Prompt tokens' in usage
 
154
  enable_pairwise_filter=True,
155
  score_with_instruction=True,
156
  pairwise_with_instruction=True,
157
+ generate_thinking=True,
158
+ score_thinking=True,
159
+ pairwise_thinking=True,
160
  ))
161
 
162
  process_log, fig, top_picks, usage = results[-1]
tests/test_tournament_utils.py CHANGED
@@ -26,7 +26,7 @@ def test_generate_players():
26
  resp = make_response([" player1 ", "player2\n"])
27
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
28
  players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
29
- mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5)
30
  assert players == ['player1', 'player2']
31
 
32
 
@@ -50,3 +50,25 @@ def test_prompt_pairwise():
50
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
51
  assert mock_comp.call_args.kwargs['temperature'] == 0.3
52
  assert result == '{"winner": "A"}'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  resp = make_response([" player1 ", "player2\n"])
27
  with patch('tournament_utils.completion', return_value=resp) as mock_comp:
28
  players = tu.generate_players('instr', 2, model='m', api_base='b', api_key='k', temperature=0.5)
29
+ mock_comp.assert_called_once_with(model='m', messages=[{'role': 'user', 'content': 'instr'}], n=2, api_base='b', api_key='k', temperature=0.5, chat_template_kwargs={'enable_thinking': False})
30
  assert players == ['player1', 'player2']
31
 
32
 
 
50
  assert mock_comp.call_args.kwargs['api_key'] == 'k'
51
  assert mock_comp.call_args.kwargs['temperature'] == 0.3
52
  assert result == '{"winner": "A"}'
53
+
54
+
55
+ def test_thinking_passed_to_completion():
56
+ resp = make_response(["ok"])
57
+ with patch('tournament_utils.completion', return_value=resp) as mock_comp:
58
+ tu.generate_players('i', 1, thinking=True)
59
+ tu.prompt_score('i', ['c'], 'block', 'p', thinking=True)
60
+ tu.prompt_pairwise('i', 'block', 'a', 'b', thinking=True)
61
+ assert mock_comp.call_count == 3
62
+ for call in mock_comp.call_args_list:
63
+ assert call.kwargs['chat_template_kwargs'] == {'enable_thinking': True}
64
+
65
+
66
+ def test_thinking_disabled_by_default():
67
+ resp = make_response(["ok"])
68
+ with patch('tournament_utils.completion', return_value=resp) as mock_comp:
69
+ tu.generate_players('i', 1)
70
+ tu.prompt_score('i', ['c'], 'block', 'p')
71
+ tu.prompt_pairwise('i', 'block', 'a', 'b')
72
+ assert mock_comp.call_count == 3
73
+ for call in mock_comp.call_args_list:
74
+ assert call.kwargs['chat_template_kwargs'] == {'enable_thinking': False}
tournament_utils.py CHANGED
@@ -25,6 +25,7 @@ def generate_players(
25
  api_base: str | None = None,
26
  api_key: str | None = None,
27
  temperature: float | None = None,
 
28
  return_usage: bool = False,
29
  ) -> list[str] | tuple[list[str], object]:
30
  """Request ``n`` completions for the instruction using the given model.
@@ -32,11 +33,14 @@ def generate_players(
32
  When ``return_usage`` is ``True`` the ``usage`` object from the completion
33
  response is also returned.
34
  """
 
 
 
35
  response = completion(
36
  model=model,
37
- messages=[{"role": "user", "content": instruction}],
38
  n=n,
39
- **_completion_kwargs(api_base, api_key, temperature),
40
  )
41
  players = [c.message.content.strip() for c in response.choices]
42
  if return_usage:
@@ -55,6 +59,7 @@ def prompt_score(
55
  api_key: str | None = None,
56
  temperature: float | None = None,
57
  include_instruction: bool = True,
 
58
  return_usage: bool = False,
59
  ) -> str | tuple[str, object]:
60
  """Return a JSON score string evaluating `player` on the criteria."""
@@ -66,10 +71,12 @@ Return JSON exactly like: {{"scores": [{example_scores}]}}."""
66
  if include_instruction:
67
  prompt += f"\n\nInstruction:\n{instruction}"
68
  prompt += f"\n\nOutput:\n{player}"
 
 
69
  response = completion(
70
  model=model,
71
  messages=[{"role": "system", "content": prompt}],
72
- **_completion_kwargs(api_base, api_key, temperature),
73
  )
74
  text = response.choices[0].message.content.strip()
75
  if return_usage:
@@ -88,6 +95,7 @@ def prompt_pairwise(
88
  api_key: str | None = None,
89
  temperature: float | None = None,
90
  include_instruction: bool = True,
 
91
  return_usage: bool = False,
92
  ) -> str | tuple[str, object]:
93
  """Return which player wins in JSON using the given criteria."""
@@ -98,10 +106,12 @@ Return ONLY JSON {{"winner": "A"}} or {{"winner": "B"}}."""
98
  if include_instruction:
99
  prompt += f"\n\nInstruction:\n{instruction}"
100
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
 
 
101
  response = completion(
102
  model=model,
103
  messages=[{"role": "system", "content": prompt}],
104
- **_completion_kwargs(api_base, api_key, temperature),
105
  )
106
  text = response.choices[0].message.content.strip()
107
  if return_usage:
 
25
  api_base: str | None = None,
26
  api_key: str | None = None,
27
  temperature: float | None = None,
28
+ thinking: bool = False,
29
  return_usage: bool = False,
30
  ) -> list[str] | tuple[list[str], object]:
31
  """Request ``n`` completions for the instruction using the given model.
 
33
  When ``return_usage`` is ``True`` the ``usage`` object from the completion
34
  response is also returned.
35
  """
36
+ messages = [{"role": "user", "content": instruction}]
37
+ kwargs = _completion_kwargs(api_base, api_key, temperature)
38
+ kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
39
  response = completion(
40
  model=model,
41
+ messages=messages,
42
  n=n,
43
+ **kwargs,
44
  )
45
  players = [c.message.content.strip() for c in response.choices]
46
  if return_usage:
 
59
  api_key: str | None = None,
60
  temperature: float | None = None,
61
  include_instruction: bool = True,
62
+ thinking: bool = False,
63
  return_usage: bool = False,
64
  ) -> str | tuple[str, object]:
65
  """Return a JSON score string evaluating `player` on the criteria."""
 
71
  if include_instruction:
72
  prompt += f"\n\nInstruction:\n{instruction}"
73
  prompt += f"\n\nOutput:\n{player}"
74
+ kwargs = _completion_kwargs(api_base, api_key, temperature)
75
+ kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
76
  response = completion(
77
  model=model,
78
  messages=[{"role": "system", "content": prompt}],
79
+ **kwargs,
80
  )
81
  text = response.choices[0].message.content.strip()
82
  if return_usage:
 
95
  api_key: str | None = None,
96
  temperature: float | None = None,
97
  include_instruction: bool = True,
98
+ thinking: bool = False,
99
  return_usage: bool = False,
100
  ) -> str | tuple[str, object]:
101
  """Return which player wins in JSON using the given criteria."""
 
106
  if include_instruction:
107
  prompt += f"\n\nInstruction:\n{instruction}"
108
  prompt += f"\n\nPlayers:\n<A>{a}</A>\n<B>{b}</B>"
109
+ kwargs = _completion_kwargs(api_base, api_key, temperature)
110
+ kwargs["chat_template_kwargs"] = {"enable_thinking": thinking}
111
  response = completion(
112
  model=model,
113
  messages=[{"role": "system", "content": prompt}],
114
+ **kwargs,
115
  )
116
  text = response.choices[0].message.content.strip()
117
  if return_usage: