BtB-ExpC committed
Commit 3cda69c · 1 Parent(s): 57a4fb4

Big changes (moved dropdowns into tabs, implemented distractors chain)

app.py CHANGED
@@ -14,12 +14,98 @@ logger = logging.getLogger(__name__)
 # --- Callback to update the exercise format dropdown based on LLM selection ---
 def update_exercise_format(selected_model: str):
     # When a Claude model is selected, default the format to XML; otherwise, default to Plaintext.
-    if selected_model == "Claude 3.5":
+    if "Claude" in selected_model:
         return gr.update(value="XML")
     else:
         return gr.update(value="Plaintext")
 
-# A generic async runner for chains.
+
+# Async wrappers for each chain.
+async def run_diagnoser(user_query: str, model_choice_validate: str, exercise_format_validate: str, sampling_count_validate: str) -> tuple:
+    # Figure out how many times to run.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_validate)))
+
+    # Fetch the DiagnoserChain configuration.
+    config = chain_configs["diagnoser"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_validate,
+        config["template_standardize"],  # Only if you kept them in config
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing.
+    chain_instance = config["class"](
+        templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(model_choice_validate, config["llm_diagnose"]),
+        template_diagnose_scorecard=config["template_diagnose_scorecard"],
+        llm_4o_mini=config["llm_4o_mini"],
+        llm_4o=config["llm_4o"]
+    )
+
+    # 3) Run the multiple samples in parallel,
+    # using a helper that does only the "diagnose" steps:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # Run concurrently.
+    responses = await asyncio.gather(*tasks)
+
+    # Pad up to 10 if needed.
+    all_responses = list(responses) + [""] * (10 - len(responses))
+
+    # Return a tuple of exactly 10 responses.
+    return tuple(all_responses)
+
+
+async def run_distractors(
+    user_query: str,
+    model_choice_distractors_1: str,
+    model_choice_distractors_2: str,
+    exercise_format_distractors: str,
+    sampling_count_distractors: str
+) -> tuple:
+    # 0) Parse how many concurrent runs (samples) we want.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_distractors)))
+    # Fetch the DistractorsChain configuration.
+    config = chain_configs["distractors"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_distractors,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Build the DistractorsChain instance.
+    chain_instance = config["class"](
+        template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
+        template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
+        llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]),  # User-selected (low- and mid-temp GPT-4o by default)
+        llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),
+        template_consolidate=config["template_consolidate"],
+        llm_consolidate=config["llm_consolidate"],
+    )
+
+    # 3) Create N tasks in parallel (one full distractor-generation pipeline per sample).
+    tasks = [
+        chain_instance.run(standardized_exercise) for _ in range(num_samples)
+    ]
+    results = await asyncio.gather(*tasks)
+
+    # 4) Pad up to 10 outputs to correspond to the 10 response fields.
+    all_responses = list(results) + [""] * (10 - len(results))
+
+    return tuple(all_responses)
+
+
+# A generic async runner for simple chains (currently not used).
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
     try:
         chain_config = chain_configs.get(chain_name)
@@ -60,48 +146,6 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str)
         logger.error(f"Error in run_chain for '{chain_name}': {e}")
         return f"Error: {e}"
 
-# Async wrappers for each chain.
-async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
-    # figure out how many times to run
-    num_samples = int("".join(filter(str.isdigit, sampling_count)))
-
-    # Fetch the DiagnoserChain configuration.
-    config = chain_configs["diagnoser"]
-
-    # 1) Standardize the user query exactly once
-    standardized_exercise = await standardize_exercise(
-        user_query,
-        exercise_format,
-        config["template_standardize"],  # Only if you kept them in config
-        config["llm_standardize"]
-    )
-
-    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
-    chain_instance = config["class"](
-        templates_diagnose=config["templates_diagnose"],
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
-        template_diagnose_scorecard=config["template_diagnose_scorecard"],
-        llm_4o_mini=config["llm_4o_mini"],
-        llm_4o=config["llm_4o"]
-    )
-
-    # 3) Run the multiple samples in parallel
-    # Create a short helper that does only the "diagnose" steps:
-    tasks = [
-        chain_instance.diagnose_only(standardized_exercise)
-        for _ in range(num_samples)
-    ]
-    # run concurrently
-    responses = await asyncio.gather(*tasks)
-
-    # pad up to 5 if needed
-    all_responses = list(responses) + [""] * (10 - len(responses))
-
-    # Return a tuple of exactly 5 responses.
-    return tuple(all_responses)
-
-async def run_distractors(user_query: str, model_choice: str) -> str:
-    return await run_chain("distractors", {"user_query": user_query}, model_choice)
 
 # -------------------------------
 # Build the Gradio Interface
@@ -116,46 +160,48 @@ with gr.Blocks() as interface:
 
     # --- Main App (initially hidden) ---
     with gr.Column(visible=False, elem_id="main_app") as app_container:
-        gr.Markdown("## Pick the tab for your task of choice below")
-        # Dropdown for LLM selection.
-        # Create a row for the control dropdowns
-        with gr.Row():
-            model_choice = gr.Dropdown(
-                choices=list(llms.keys()),
-                value="GPT-4o",
-                label="Select LLM",
-                interactive=True,
-            )
-            exercise_format = gr.Dropdown(
-                choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
-                value="Markdown",
-                label="Exercise Format",
-                interactive=True,
-            )
-            sampling_count = gr.Dropdown(
-                choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
-                value="1",
-                label="Sampling Count",
-                interactive=True,
-            )
-        # Set up a change callback so that if the user selects "Claude 3.5", the exercise format updates to "XML"
-        model_choice.change(
-            fn=update_exercise_format,
-            inputs=[model_choice],
-            outputs=[exercise_format]
-        )
+        gr.Markdown("## Pick the tab for your task of choice")
+
         with gr.Tabs():
-            with gr.TabItem("🩺 Validate exercise"):
+            with gr.TabItem("🩺 Diagnose exercise"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(
                     """
                     <div style="margin-bottom: 10px;">
-                        <span style="font-size: 1.5em; cursor: help;" title="Validate exercise: Diagnoses potential issues for the given exercise(s).">
-                            ℹ️ <i>← mouseover for more info</i>
+                        <span style="font-size: 1.5em; cursor: help;" title="Diagnose exercise: Diagnoses potential issues for the given exercise(s).">
+                            ℹ️ <i>← mouseover</i>
                         </span>
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count, etc.
+                with gr.Row():
+                    model_choice_validate = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select LLM",
+                        interactive=True,
+                    )
+                    exercise_format_validate = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Markdown",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_validate = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that selecting any model with "Claude" in the name switches the exercise format to "XML".
+                model_choice_validate.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_validate],
+                    outputs=[exercise_format_validate]
+                )
+
                 diagnoser_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Exercise body: <mc:exercise xmlns:mc= ...")
                 diagnoser_button = gr.Button("Submit")
                 diagnoser_response_1 = gr.Textbox(label="Response 1", interactive=False)
@@ -168,6 +214,8 @@ with gr.Blocks() as interface:
                 diagnoser_response_8 = gr.Textbox(label="Response 8", interactive=False)
                 diagnoser_response_9 = gr.Textbox(label="Response 9", interactive=False)
                 diagnoser_response_10 = gr.Textbox(label="Response 10", interactive=False)
+
+
             with gr.TabItem("🤔 Generate distractors"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(
@@ -179,10 +227,52 @@ with gr.Blocks() as interface:
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count, etc.
+                with gr.Row():
+                    model_choice_distractors_1 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select first LLM",
+                        interactive=True,
+                    )
+                    model_choice_distractors_2 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (mid temp)",
+                        label="Select second LLM",
+                        interactive=True,
+                    )
+                    exercise_format_distractors = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Plaintext",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_distractors = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that selecting any model with "Claude" in the name switches the exercise format to "XML".
+                model_choice_distractors_1.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_distractors_1],
+                    outputs=[exercise_format_distractors]
+                )
+
                 distractors_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Stelling: Dit is een ..... voorbeeld van een stelling. A. Mooi B. Lelijk ...")
                 distractors_button = gr.Button("Submit")
-                gr.Markdown("**Response(s):**")
-                distractors_responses = gr.Column()
+                distractors_response_1 = gr.Textbox(label="Response 1", interactive=False)
+                distractors_response_2 = gr.Textbox(label="Response 2", interactive=False)
+                distractors_response_3 = gr.Textbox(label="Response 3", interactive=False)
+                distractors_response_4 = gr.Textbox(label="Response 4", interactive=False)
+                distractors_response_5 = gr.Textbox(label="Response 5", interactive=False)
+                distractors_response_6 = gr.Textbox(label="Response 6", interactive=False)
+                distractors_response_7 = gr.Textbox(label="Response 7", interactive=False)
+                distractors_response_8 = gr.Textbox(label="Response 8", interactive=False)
+                distractors_response_9 = gr.Textbox(label="Response 9", interactive=False)
+                distractors_response_10 = gr.Textbox(label="Response 10", interactive=False)
             with gr.TabItem("🚧 Generate learning objectives"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(
@@ -211,7 +301,7 @@
 
         diagnoser_button.click(
             fn=run_diagnoser,
-            inputs=[diagnoser_input, model_choice, exercise_format, sampling_count],
+            inputs=[diagnoser_input, model_choice_validate, exercise_format_validate, sampling_count_validate],
             outputs=[
                 diagnoser_response_1,
                 diagnoser_response_2,
@@ -228,8 +318,19 @@
 
         distractors_button.click(
             fn=run_distractors,
-            inputs=[distractors_input, model_choice, exercise_format, sampling_count],
-            outputs=[distractors_responses]
+            inputs=[distractors_input, model_choice_distractors_1, model_choice_distractors_2, exercise_format_distractors, sampling_count_distractors],
+            outputs=[
+                distractors_response_1,
+                distractors_response_2,
+                distractors_response_3,
+                distractors_response_4,
+                distractors_response_5,
+                distractors_response_6,
+                distractors_response_7,
+                distractors_response_8,
+                distractors_response_9,
+                distractors_response_10
+            ]
         )
 
     # Launch the app.
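
Both new async wrappers follow the same fan-out pattern: parse the sampling-count string from the dropdown, launch N identical chain calls with asyncio.gather, then pad the results so the returned tuple always matches the 10 fixed output textboxes. A minimal, self-contained sketch of just that pattern; fake_chain_run is a hypothetical stand-in for the real chain call:

import asyncio

async def fake_chain_run(exercise: str) -> str:
    # Hypothetical stand-in for chain_instance.run(...) or diagnose_only(...).
    await asyncio.sleep(0.01)
    return f"response for: {exercise}"

async def fan_out(exercise: str, sampling_count: str, slots: int = 10) -> tuple:
    # Same parsing trick as app.py: keep only the digits of the dropdown string.
    num_samples = int("".join(filter(str.isdigit, sampling_count)))
    results = await asyncio.gather(*(fake_chain_run(exercise) for _ in range(num_samples)))
    # Pad with empty strings so every fixed Gradio textbox receives a value.
    return tuple(list(results) + [""] * (slots - len(results)))

print(asyncio.run(fan_out("Sample exercise", "3")))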
chains/distractors_chain.py CHANGED
@@ -1,4 +1,5 @@
 # chains/distractors_chain.py
+import asyncio
 from pydantic import BaseModel
 from typing import Any
 from langchain_core.prompts.chat import ChatPromptTemplate
@@ -6,29 +7,86 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DistractorsChain(BaseModel):
-    template_standardize: ChatPromptTemplate
-    template_distractors: ChatPromptTemplate
-    llm_standardize: Any  # Fixed LLM for step 1
-    llm_distractors: Any  # User-selectable LLM for step 2
+    template_distractors_brainstorm_1: ChatPromptTemplate
+    template_distractors_brainstorm_2: ChatPromptTemplate
+    llm_brainstorm_1: Any  # User-selectable LLMs for the brainstorm step
+    llm_brainstorm_2: Any
+    template_consolidate: ChatPromptTemplate
+    llm_consolidate: Any
 
-    async def run(self, user_query: str, exercise_format: str) -> str:
+    async def run(self, standardized_exercise: str) -> str:
         """
-        Runs the composite chain:
-        1. Standardizes the exercise formatting (if exercise_format isn't Raw).
-        2. Generates new distractors from the standardized format.
+        Overall flow (the exercise is standardized upstream, in app.py):
+        1) Run 4 parallel brainstorming calls:
+           - 2 use 'template_distractors_brainstorm_1' (low-temp, high-temp)
+           - 2 use 'template_distractors_brainstorm_2' (low-temp, high-temp)
+        2) Merge those four partial results into a single final answer
+           via a "consolidation" prompt.
+        3) Return the final string.
         """
-        # --- Step 1: Standardize the exercise formatting (if exercise_format isn't 'Raw (original)') ---
-        standardized_exercise = await standardize_exercise(
-            user_query, exercise_format, self.template_standardize, self.llm_standardize
-        )
-
-        # --- Step 2: Generate new distractors using the standardized exercise ---
-        prompt_distractors = await self.template_distractors.aformat_prompt(standardized_exercise=standardized_exercise)
-        distractors_messages = prompt_distractors.to_messages()
-        distractors = await self.llm_distr.ainvoke(distractors_messages)
-
-        return distractors
+
+        # --- Step 1: Brainstorm in parallel ---
+        async def run_brainstorm(
+            prompt_template: ChatPromptTemplate,
+            llm_brainstorm: Any,
+            index_label: str
+        ) -> str:
+            # Format the prompt
+            prompt = await prompt_template.aformat_prompt(
+                standardized_exercise=standardized_exercise
+            )
+            messages = prompt.to_messages()
+
+            # Call the specified LLM
+            response = await llm_brainstorm.ainvoke(messages)
+            content = getattr(response, "content", response)
+
+            return f"[Brainstorm {index_label}]\n{content}"
+
+        tasks = []
+        # Template 1, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_1,
+            "T1-Low"
+        ))
+        # Template 1, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_2,
+            "T1-High"
+        ))
+        # Template 2, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_1,
+            "T2-Low"
+        ))
+        # Template 2, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_2,
+            "T2-High"
+        ))
+
+        # Kick them off concurrently
+        brainstorm_results = await asyncio.gather(*tasks)
+
+        # Combine them into a single multiline string
+        combined_brainstorms = "\n\n".join(brainstorm_results)
+
+        # --- Step 2: Consolidate the 4 partial outputs into a final response ---
+        consolidation_prompt = await self.template_consolidate.aformat_prompt(
+            brainstorm_outputs=combined_brainstorms,
+            standardized_exercise=standardized_exercise
+        )
+        consolidation_messages = consolidation_prompt.to_messages()
+
+        consolidation_response = await self.llm_consolidate.ainvoke(consolidation_messages)
+        final_output = getattr(consolidation_response, "content", consolidation_response)
+
+        # Return the final merged distractors response
+        return final_output
 
     class Config:
         arbitrary_types_allowed = True
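
Because the llm_* fields are typed Any, the brainstorm/consolidate flow can be exercised end-to-end without API keys. A hypothetical harness, with EchoLLM standing in for a chat model (anything exposing an async ainvoke works) and deliberately trivial templates:

import asyncio
from langchain_core.prompts.chat import ChatPromptTemplate
from chains.distractors_chain import DistractorsChain

class EchoLLM:
    def __init__(self, name: str):
        self.name = name
    async def ainvoke(self, messages):
        # Returns a plain string; run() accepts it via getattr(response, "content", response).
        return f"<{self.name}: {len(messages)} message(s)>"

brainstorm = ChatPromptTemplate.from_messages(
    [("system", "Brainstorm distractors."), ("human", "{standardized_exercise}")]
)
consolidate = ChatPromptTemplate.from_messages(
    [("system", "Merge the brainstorms."),
     ("human", "{brainstorm_outputs}\n\n{standardized_exercise}")]
)

chain = DistractorsChain(
    template_distractors_brainstorm_1=brainstorm,
    template_distractors_brainstorm_2=brainstorm,
    llm_brainstorm_1=EchoLLM("low-temp"),
    llm_brainstorm_2=EchoLLM("high-temp"),
    template_consolidate=consolidate,
    llm_consolidate=EchoLLM("consolidator"),
)
print(asyncio.run(chain.run("Standardized exercise text")))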
config/chain_configs.py CHANGED
@@ -7,20 +7,23 @@ from config.templates import (
     template_diagnose_correct_answer_stands_out,
     template_diagnose_distractor_clearly_wrong,
     template_diagnose_distractor_partially_correct,
-    diagnose_scorecard_template
+    diagnose_scorecard_template,
+    template_distractors_brainstorm_1,
+    template_distractors_brainstorm_2,
+    distractors_consolidate_template
 )
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
 
-# Note: The default LLM here is 4o; the UI can override this choice.
+# Note: The default LLM here is GPT-4o (low temp); the UI can override this choice.
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
         "llm_4o_mini": llms["GPT-4o-mini"],
-        "llm_4o": llms["GPT-4o"],
+        "llm_4o": llms["GPT-4o (low temp)"],
         # 4 different diagnosis templates (run in parallel):
         "templates_diagnose": [
             template_diagnose_double_negation,
@@ -29,14 +32,17 @@ chain_configs = {
             template_diagnose_distractor_partially_correct,
         ],
         "template_diagnose_scorecard": diagnose_scorecard_template,
-        "llm_diagnose": llms["GPT-4o"],  # Default; can be replaced in UI
+        "llm_diagnose": llms["GPT-4o (low temp)"],  # Default; can be replaced in UI
     },
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
-        "template_distractors": distractors_template,
-        "llm_distractors": llms["GPT-4o"],  # Default; can be replaced in UI
-        "llm_4o_mini": llms["GPT-4o-mini"],
+        "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
+        "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
+        "llm_brainstorm_1": llms["GPT-4o (low temp)"],
+        "llm_brainstorm_2": llms["GPT-4o (mid temp)"],
+        "template_consolidate": distractors_consolidate_template,
+        "llm_consolidate": llms["GPT-4o (low temp)"],  # or something else
     },
 }
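
Everything in each config entry except "class" and the two standardize keys is keyword-compatible with the corresponding chain constructor, so a chain could also be built generically. One possible sketch (app.py spells the kwargs out explicitly instead):

from config.chain_configs import chain_configs

config = chain_configs["distractors"]
ctor_kwargs = {
    k: v for k, v in config.items()
    if k not in ("class", "template_standardize", "llm_standardize")
}
chain = config["class"](**ctor_kwargs)  # equivalent to the explicit call in app.py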
config/llm_config.py CHANGED
@@ -19,8 +19,12 @@ HIGH = 1.2
 def create_openai_llm(model_name: str, temperature: float):
     return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, temperature=temperature)
 
-def create_openai_reasoning_llm(model_name: str):
-    return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name)
+def create_openai_reasoning_llm(model_name: str, reasoning_effort: str = None):
+    # If reasoning_effort is provided, pass it; otherwise, avoid sending the parameter.
+    if reasoning_effort:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, reasoning_effort=reasoning_effort)
+    else:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name)
 
 def create_anthropic_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)
@@ -29,14 +33,26 @@ def create_deepseek_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)
 
 llms = {
-    "GPT-4o": create_openai_llm("gpt-4o", LOW),
+    # OpenAI models with temperature
+
+    "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
+    "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
+    "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
     "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
-    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", HIGH),
+    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
+
+    # OpenAI reasoning models (no temperature)
     "o1": create_openai_reasoning_llm("o1-2024-12-17"),
-    "o3-mini": create_openai_reasoning_llm("o3-mini-2025-01-31"),
-    "Claude 3.5": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
-    "Deepseek R1🚧": create_anthropic_llm("deepseek-reasoner", LOW),
-}
+    "o3-mini (high-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
+
+    # Anthropic models (Claude)
+    "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
+    "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
+    "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
+
+    # DeepSeek
+    "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
+}
config/templates.py CHANGED
@@ -178,6 +178,7 @@ diagnose_scorecard_template = ChatPromptTemplate(
         <example 3>
         1. The exercise contains a double negative: ❌ -- 2. The correct answer does not stand out: ✅ -- 3. Some of the distractors are too obviously false: ❌ -- 4. None of the distractors are actually also kinda correct: ✅
         </example 3>
+        Sometimes the diagnoses will be short and clear, but sometimes they will be elaborate and view the issue from different angles. In that case, give extra weight to the final sentence of a diagnosis, since that is usually where the conclusion is drawn.
         """),
         ("human", "{combined_diagnosis}")
     ],
  ],