BtB-ExpC committed on
Commit
4cbd4c5
Β·
1 Parent(s): 7aead4f

Cleaned up model picker and exercise formatting standardization handling

Browse files
app.py CHANGED
@@ -116,10 +116,10 @@ async def run_distractors(
116
  chain_instance = config["class"](
117
  template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
118
  template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
119
- llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]), # User-selected (low and high temp GPT-4o by default)
120
- llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),
121
  template_consolidate=config["template_consolidate"],
122
- llm_consolidate=config["llm_consolidate"],
123
  )
124
 
125
  # 3) Create N tasks in parallel (one full distractor generation pipeline per sample)
@@ -262,14 +262,14 @@ with gr.Blocks() as interface:
262
  with gr.Row():
263
  model_choice_distractors_1 = gr.Dropdown(
264
  choices=list(llms.keys()),
265
- value="GPT-4o (low temp)",
266
- label="LLM 1",
267
  interactive=True,
268
  )
269
  model_choice_distractors_2 = gr.Dropdown(
270
  choices=list(llms.keys()),
271
- value="GPT-4o (mid temp)",
272
- label="LLM 2",
273
  interactive=True,
274
  )
275
  exercise_format_distractors = gr.Dropdown(
@@ -278,25 +278,31 @@ with gr.Blocks() as interface:
278
  label="Exercise Format",
279
  interactive=True,
280
  )
281
- sampling_count_distractors = gr.Dropdown(
282
- choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
283
- value="1",
284
- label="Response Count",
285
  interactive=True,
286
  )
287
- intermediate_distractors_specification = gr.Dropdown(
288
- choices=["", "2", "3", "4", "5", "6", "7", "8", "9", "10", "a few", "some", "a whole lot of", "a wide range of", "novel"],
289
- value="8",
290
- label="Brainstorm Nx4 distractors",
291
  interactive=True,
292
  )
293
  final_distractors_specification = gr.Dropdown(
294
- choices=["all unique distractors", "the best distractors", "only the very best distractors", "4", "5", "6", "7", "8", "9", "10", "11", "12", "a few", "some", "a whole lot of",
295
- "a wide range of", "novel"],
296
- value="all unique distractors",
297
  label="Finally display X distractors",
298
  interactive=True,
299
  )
 
 
 
 
 
 
300
  # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
301
  model_choice_distractors_1.change(
302
  fn=update_exercise_format,
 
116
  chain_instance = config["class"](
117
  template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
118
  template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
119
+ llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]), # User-selected LLM 1
120
+ llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]), # User-selected LLM 2
121
  template_consolidate=config["template_consolidate"],
122
+ llm_consolidate=llms.get(model_choice_distractors_3, config["llm_consolidate"]), # User-selected LLM 3
123
  )
124
 
125
  # 3) Create N tasks in parallel (one full distractor generation pipeline per sample)
 
262
  with gr.Row():
263
  model_choice_distractors_1 = gr.Dropdown(
264
  choices=list(llms.keys()),
265
+ value="GPT-4o (mid temp)",
266
+ label="LLM 1 - for brainstorming",
267
  interactive=True,
268
  )
269
  model_choice_distractors_2 = gr.Dropdown(
270
  choices=list(llms.keys()),
271
+ value="Claude 3.5 (mid temp)",
272
+ label="LLM 2 - for brainstorming",
273
  interactive=True,
274
  )
275
  exercise_format_distractors = gr.Dropdown(
 
278
  label="Exercise Format",
279
  interactive=True,
280
  )
281
+ intermediate_distractors_specification = gr.Dropdown(
282
+ choices=[" ", " 2 ", " 3 ", " 4 ", " 5 ", " 6 ", " 7 ", " 8 ", " 9 ", " 10 ", " a few ", " some ", " a whole lot of ", " a wide range of ", " novel "],
283
+ value=" 8 ",
284
+ label="Brainstorm X distractors x4",
285
  interactive=True,
286
  )
287
+ model_choice_distractors_3 = gr.Dropdown(
288
+ choices=list(llms.keys()),
289
+ value="GPT-4o (low temp)",
290
+ label="LLM 3 - for interpreting results",
291
  interactive=True,
292
  )
293
  final_distractors_specification = gr.Dropdown(
294
+ choices=[" ", " of all unique distractors", " of the top 5", " of the best distractors", " of only the very best", " of the best 4", " of the best 5", " of the best 6", " of the best 7", " of the best 8", " of the best 9", " of the best 10", " of the best 11", " of the best 12", " of a few of them", " of some of them", " of most of them",
295
+ " of a wide range of", " of the 3 worst"],
296
+ value=" of all unique distractors",
297
  label="Finally display X distractors",
298
  interactive=True,
299
  )
300
+ sampling_count_distractors = gr.Dropdown(
301
+ choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
302
+ value="1",
303
+ label="Response Count",
304
+ interactive=True,
305
+ )
306
  # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
307
  model_choice_distractors_1.change(
308
  fn=update_exercise_format,
chains/distractors_chain.py CHANGED
@@ -41,32 +41,32 @@ class DistractorsChain(BaseModel):
41
  response = await llm_brainstorm.ainvoke(messages)
42
  content = getattr(response, "content", response)
43
 
44
- return f"[Brainstorm {index_label}]\n{content}"
45
 
46
  tasks = []
47
- # Template 1, low-temp
48
  tasks.append(run_brainstorm(
49
  self.template_distractors_brainstorm_1,
50
  self.llm_brainstorm_1,
51
- "T1-Low"
52
  ))
53
- # Template 1, high-temp
54
  tasks.append(run_brainstorm(
55
  self.template_distractors_brainstorm_1,
56
  self.llm_brainstorm_2,
57
- "T1-High"
58
  ))
59
- # Template 2, low-temp
60
  tasks.append(run_brainstorm(
61
  self.template_distractors_brainstorm_2,
62
  self.llm_brainstorm_1,
63
- "T2-Low"
64
  ))
65
- # Template 2, high-temp
66
  tasks.append(run_brainstorm(
67
  self.template_distractors_brainstorm_2,
68
  self.llm_brainstorm_2,
69
- "T2-High"
70
  ))
71
 
72
  # Kick them off concurrently
 
41
  response = await llm_brainstorm.ainvoke(messages)
42
  content = getattr(response, "content", response)
43
 
44
+ return f"[ --- list separator {index_label} ---]\n\n{content}"
45
 
46
  tasks = []
47
+ # Template 1, LLM 1
48
  tasks.append(run_brainstorm(
49
  self.template_distractors_brainstorm_1,
50
  self.llm_brainstorm_1,
51
+ "T1-1"
52
  ))
53
+ # Template 1, LLM 2
54
  tasks.append(run_brainstorm(
55
  self.template_distractors_brainstorm_1,
56
  self.llm_brainstorm_2,
57
+ "T1-2"
58
  ))
59
+ # Template 2, LLM 1
60
  tasks.append(run_brainstorm(
61
  self.template_distractors_brainstorm_2,
62
  self.llm_brainstorm_1,
63
+ "T2-1"
64
  ))
65
+ # Template 2, LLM 2
66
  tasks.append(run_brainstorm(
67
  self.template_distractors_brainstorm_2,
68
  self.llm_brainstorm_2,
69
+ "T2-2"
70
  ))
71
 
72
  # Kick them off concurrently
config/chain_configs.py CHANGED
@@ -19,8 +19,8 @@ chain_configs = {
19
  "diagnoser": {
20
  "class": DiagnoserChain,
21
  "template_standardize": standardize_template,
22
- "llm_standardize": llms["GPT-4o-mini-zero"], # Always fixed
23
- "llm_4o_mini": llms["GPT-4o-mini"],
24
  "llm_4o": llms["GPT-4o (low temp)"],
25
  # 4 different diagnosis templates (to run in parallel:
26
  "templates_diagnose": [
@@ -35,7 +35,7 @@ chain_configs = {
35
  "distractors": {
36
  "class": DistractorsChain,
37
  "template_standardize": standardize_template,
38
- "llm_standardize": llms["GPT-4o-mini-zero"], # Always fixed
39
  "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
40
  "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
41
  "llm_brainstorm_1": llms["GPT-4o (low temp)"],
 
19
  "diagnoser": {
20
  "class": DiagnoserChain,
21
  "template_standardize": standardize_template,
22
+ "llm_standardize": llms["GPT-4o-mini (zero temp)"], # Always fixed
23
+ "llm_4o_mini": llms["GPT-4o-mini (low temp)"],
24
  "llm_4o": llms["GPT-4o (low temp)"],
25
  # 4 different diagnosis templates (to run in parallel:
26
  "templates_diagnose": [
 
35
  "distractors": {
36
  "class": DistractorsChain,
37
  "template_standardize": standardize_template,
38
+ "llm_standardize": llms["GPT-4o-mini (zero temp)"], # Always fixed
39
  "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
40
  "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
41
  "llm_brainstorm_1": llms["GPT-4o (low temp)"],
config/exercise_standardizer.py CHANGED
@@ -22,6 +22,7 @@ async def standardize_exercise(user_query: str, exercise_format: str, template:
22
  )
23
 
24
  std_messages = prompt_std.to_messages()
25
- standardized_exercise = await llm.ainvoke(std_messages)
 
26
 
27
  return standardized_exercise
 
22
  )
23
 
24
  std_messages = prompt_std.to_messages()
25
+ response = await llm.ainvoke(std_messages)
26
+ standardized_exercise = getattr(response, "content", response)
27
 
28
  return standardized_exercise
config/llm_config.py CHANGED
@@ -12,7 +12,7 @@ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
12
  # Define temperature presets (adjust as needed)
13
  ZERO = 0
14
  LOW = 0.2
15
- MID = 0.7
16
  HIGH = 1.2
17
 
18
  # Factory functions for each provider
@@ -35,15 +35,13 @@ def create_deepseek_llm(model_name: str, temperature: float):
35
  # all of them in one dictionary
36
  llms = {
37
  # OpenAI models with temperature
38
-
39
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
40
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
41
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
42
- "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
43
- "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
44
- "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
45
- "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
46
- "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
47
 
48
  # OpenAI reasoning models (no temperature)
49
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
@@ -52,40 +50,46 @@ llms = {
52
  "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
53
 
54
  # Anthropic models (Claude)
 
55
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
56
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
57
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
58
 
59
  # DeepSeek
 
60
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
 
 
61
  }
62
 
63
  # specific for Diagnosis tab
64
- llms_diagnosis_tab = {
65
- # OpenAI models with temperature
66
-
67
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
68
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
69
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
70
- "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
71
- "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
72
- "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
73
- "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
74
- "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
75
 
76
  # OpenAI reasoning models (no temperature)
77
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
78
- "o3-mini (low-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
79
- "o3-mini (medium-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
80
- "o3-mini (high-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
81
-
82
-
83
 
84
- # Anthropic models (Claude)
 
85
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
86
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
87
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
 
 
88
 
89
  # DeepSeek
 
90
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
 
 
91
  }
 
12
  # Define temperature presets (adjust as needed)
13
  ZERO = 0
14
  LOW = 0.2
15
+ MID = 0.6
16
  HIGH = 1.2
17
 
18
  # Factory functions for each provider
 
35
  # all of them in one dictionary
36
  llms = {
37
  # OpenAI models with temperature
38
+ "GPT-4o (zero temp)": create_openai_llm("gpt-4o", ZERO),
39
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
40
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
41
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
42
+ "GPT-4o-mini (zero temp)": create_openai_llm("gpt-4o-mini", ZERO),
43
+ "GPT-4o-mini (low temp)": create_openai_llm("gpt-4o-mini", LOW),
44
+ "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
 
45
 
46
  # OpenAI reasoning models (no temperature)
47
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
 
50
  "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
51
 
52
  # Anthropic models (Claude)
53
+ "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
54
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
55
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
56
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
57
 
58
  # DeepSeek
59
+ "Deepseek R1 (zero temp)🚧": create_anthropic_llm("deepseek-reasoner", ZERO),
60
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
61
+ "Deepseek R1 (mid temp)🚧": create_anthropic_llm("deepseek-reasoner", MID),
62
+ "Deepseek R1 (high temp)🚧": create_anthropic_llm("deepseek-reasoner", HIGH),
63
  }
64
 
65
  # specific for Diagnosis tab
66
+ llms_most_wanted = {
67
+ # OpenAI models
68
+ "GPT-4o (zero temp)": create_openai_llm("gpt-4o", ZERO),
69
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
70
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
71
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
72
+ "GPT-4o-mini (zero temp)": create_openai_llm("gpt-4o-mini", ZERO),
73
+ "GPT-4o-mini (low temp)": create_openai_llm("gpt-4o-mini", LOW),
74
+ "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
 
75
 
76
  # OpenAI reasoning models (no temperature)
77
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
78
+ "o3-mini (low-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
79
+ "o3-mini (medium-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
80
+ "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
 
 
81
 
82
+ # Anthropic models
83
+ "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
84
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
85
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
86
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
87
+ "Claude 3.5 Haiku (zero temp)": create_anthropic_llm("claude-3-5-haiku-latest", HIGH),
88
+ "Claude 3.5 Haiku (low temp)": create_anthropic_llm("claude-3-5-haiku-latest", HIGH),
89
 
90
  # DeepSeek
91
+ "Deepseek R1 (zero temp)🚧": create_anthropic_llm("deepseek-reasoner", ZERO),
92
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
93
+ "Deepseek R1 (mid temp)🚧": create_anthropic_llm("deepseek-reasoner", MID),
94
+ "Deepseek R1 (high temp)🚧": create_anthropic_llm("deepseek-reasoner", HIGH),
95
  }
config/templates.py CHANGED
@@ -189,7 +189,7 @@ diagnose_scorecard_template = ChatPromptTemplate(
189
 
190
  template_distractors_brainstorm_1 = ChatPromptTemplate(
191
  messages=[
192
- ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with {intermediate_distractors_specification} additional high-quality distractors: "
193
  "alternative answer options that are not correct, yet also not so implausible that even poorly informed students would immediately dismiss them. Make sure to use the same language as the existing exercise."),
194
  ("human", "{standardized_exercise}")
195
  ],
@@ -198,7 +198,7 @@ template_distractors_brainstorm_1 = ChatPromptTemplate(
198
 
199
  template_distractors_brainstorm_2 = ChatPromptTemplate(
200
  messages=[
201
- ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with {intermediate_distractors_specification} additional high-quality distractors: "
202
  "alternative answer options that are not correct, yet not so implausible that even poorly informed students would immediately dismiss them. Go about this very methodically: "
203
  "Really try to think outside of the box and get creative here, providing potential alternative distractors across a wide range of options. "
204
  "Before you present your final selection, take your time to really consider the entire solution space, weighing your different ideas an options, then to list the distractors. Make sure to use the same language as the existing exercise."),
@@ -211,13 +211,12 @@ template_distractors_brainstorm_2 = ChatPromptTemplate(
211
  template_consolidate_distractors = ChatPromptTemplate(
212
  messages=[
213
  ("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated into one list. "
214
- "Filter out duplicates, do some logical sorting among them, and just return one plain list of {final_distractors_specification}. "
215
- "Only focus on the distractors (answer options) themselves, ignore any reasoning about them. Return only the list, formatted without numbering or bullet points, just every distractor on its own line. Use the same language as the existing exercise. "),
216
- ("human", "For context, this is the exercise that the distractors are about: "
217
  "{standardized_exercise} "
218
- ""
219
- "Here are the lists:"
220
- "{brainstorm_outputs}")
221
  ],
222
  input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
223
  )
 
189
 
190
  template_distractors_brainstorm_1 = ChatPromptTemplate(
191
  messages=[
192
+ ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with{intermediate_distractors_specification}additional high-quality distractors: "
193
  "alternative answer options that are not correct, yet also not so implausible that even poorly informed students would immediately dismiss them. Make sure to use the same language as the existing exercise."),
194
  ("human", "{standardized_exercise}")
195
  ],
 
198
 
199
  template_distractors_brainstorm_2 = ChatPromptTemplate(
200
  messages=[
201
+ ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with{intermediate_distractors_specification}additional high-quality distractors: "
202
  "alternative answer options that are not correct, yet not so implausible that even poorly informed students would immediately dismiss them. Go about this very methodically: "
203
  "Really try to think outside of the box and get creative here, providing potential alternative distractors across a wide range of options. "
204
  "Before you present your final selection, take your time to really consider the entire solution space, weighing your different ideas an options, then to list the distractors. Make sure to use the same language as the existing exercise."),
 
211
  template_consolidate_distractors = ChatPromptTemplate(
212
  messages=[
213
  ("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated into one list. "
214
+ "Filter out duplicates, do some logical sorting among them, and just return one plain list{final_distractors_specification}. "
215
+ "Only focus on the distractors (answer options) themselves, ignore any reasoning about them. Return only the list, nothing else. Format the list without numbering or bullet points, just put every distractor on its own line. Use the same language as the existing exercise. "),
216
+ ("human", "For context, this is the exercise that the distractors are about:\n "
217
  "{standardized_exercise} "
218
+ "Here are the lists:\n "
219
+ "{brainstorm_outputs} ")
 
220
  ],
221
  input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
222
  )