BtB-ExpC committed
Commit eec7865 · 2 Parent(s): 7282fa9 9d2a494

added new models (Gemini 2.5 pro & gpt-4.1-mini)

app/ui/write_fluster_tab.py CHANGED
@@ -28,7 +28,7 @@ def build_write_fluster_tab():
     )

     include_diagnosis = gr.Checkbox(
-        label="Immediately diagnose & fix",
+        label="Immediately diagnose & fix 🚧",
        value=False,
        info="Diagnose each exercise and fix if issues found?"
     )

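Note: the 🚧 presumably flags the diagnose-and-fix path as work in progress. For context, a minimal sketch of how a checkbox like this is typically wired into a tab's run handler in Gradio; the surrounding component names (user_text, run_btn, output_box) and the handler are hypothetical, not taken from this repo:

import gradio as gr

with gr.Blocks() as demo:
    user_text = gr.Textbox(label="Learning objective")
    include_diagnosis = gr.Checkbox(
        label="Immediately diagnose & fix 🚧",
        value=False,
        info="Diagnose each exercise and fix if issues found?"
    )
    run_btn = gr.Button("Write fluster")
    output_box = gr.Textbox(label="Result")

    def on_run(text: str, diagnose: bool) -> str:
        # The checkbox value arrives as a plain bool; route to the
        # diagnosis pipeline only when it is ticked.
        return f"diagnose={diagnose}: {text}"

    run_btn.click(on_run, inputs=[user_text, include_diagnosis], outputs=output_box)
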
chains/exercises/run_fluster_with_diagnosis.py CHANGED
@@ -1,6 +1,6 @@
 # chains/exercises/run_fluster_with_diagnosis.py
 import asyncio
-from typing import Tuple, List
+from typing import Tuple, List, Any

 from app.helpers.exercise_standardizer import structurize_exercise, ExerciseSet, Exercise, exercise_to_string
 from chains.exercises.runner_without import write_fluster_track
@@ -64,6 +64,9 @@ async def _async_fluster_with_diagnosis(
     fluster_config = chain_configs["fluster"]
     diagnoser_config = chain_configs["diagnoser"]

+    llm1 = llms.get(model_choice_1, fluster_config["default_llm_a"])
+    llm2 = llms.get(model_choice_2, fluster_config["default_llm_b"])
+
     # 1) Generate track0 & track2 in parallel
     track0_coro = write_fluster_track(
         user_input_text,
@@ -85,8 +88,8 @@ async def _async_fluster_with_diagnosis(
     fluster2_exs = await parse_fluster_text_to_exercises(track2_text)

     # 3) Diagnose + fix each exercise
-    diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config)
-    diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config)
+    diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config, llm_fix=llm1)
+    diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config, llm_fix=llm2)

     # 4) Convert the final exercises to strings for display
     # (Or you can store them back into a bigger data structure.)
@@ -110,16 +113,6 @@ async def _async_fluster_with_diagnosis(
         final2_text  # fixes_box_3
     )

-def run_fluster_with_diagnosis(
-    user_input_text: str,
-    model_choice_1: str,
-    model_choice_2: str
-) -> Tuple[str, str, str, str, str, str, str, str]:
-    """
-    Synchronous entrypoint for the UI or external calls.
-    """
-    return asyncio.run(_async_fluster_with_diagnosis(user_input_text, model_choice_1, model_choice_2))
-

 async def write_fluster_track(
     user_input: str,
@@ -140,8 +133,7 @@ async def write_fluster_track(
     # 2) Decide LLM
     # either use model_choice_key from user, or the config's default
     fallback_llm = fluster_config["default_llm_a"] if track_index in (0, 1) else fluster_config["default_llm_b"]
-    gen_llm = chain_configs["fluster"].get(model_choice_key, fallback_llm)
-    # ^ careful: you'd need a dictionary of LLMs. Or do: llm = llms.get(model_choice_key, fallback_llm)
+    gen_llm = llms.get(model_choice_key, fallback_llm)

     # 3) Format + invoke the "writing" prompt
     prompt_value = await gen_template.aformat_prompt(learning_objective=user_input)
@@ -166,9 +158,9 @@ async def write_fluster_track(


 async def diagnose_and_fix_all(
-    exercises: List[Exercise],
-    diagnoser_config: dict
-) -> tuple[List[str], List[Exercise]]:
+    exercises: List[Exercise],
+    diagnoser_config: dict,
+    llm_fix: Any) -> tuple[List[str], List[Exercise]]:
     """
     For each exercise, run the 'diagnose_only' from the DiagnoserChain,
     then interpret the results (scorecard) to see if we need a fix,
@@ -177,6 +169,7 @@ async def diagnose_and_fix_all(
     Returns:
     - a list of strings (one per exercise) summarizing the diagnosis,
     - a list of possibly fixed exercises.
+    :param llm_fix: optional LLM override for the rewrite step; falls back to the config default.
     """
     diag_chain = diagnoser_config["class"](
         templates_diagnose=diagnoser_config["templates_diagnose"],
@@ -206,7 +199,7 @@ async def diagnose_and_fix_all(
     fluster_config = chain_configs["fluster"]

     if "❌" in scorecard:
-        ex_fixed = await fix_exercise(ex, scorecard, fluster_config)
+        ex_fixed = await fix_exercise(ex, scorecard, fluster_config, llm_fix)
         fixed_exs.append(ex_fixed)
     else:
         fixed_exs.append(ex)
@@ -247,55 +240,27 @@ async def diagnose_exercise(ex: Exercise) -> str:

 from pydantic import ValidationError

-async def fix_exercise(
-    ex: Exercise,
-    diag_str: str,
-    fluster_config: dict
-) -> Exercise:
-    """
-    Calls 'template_fix_exercise' + 'llm_fix_exercise' from the fluster config
-    to rewrite the exercise so it addresses the diagnosis issues.
-    """
-
-    template_fix = fluster_config["template_fix_exercise"]
-    llm_fix = fluster_config["llm_fix_exercise"]
-
-    # 1) Convert the exercise to text
-    ex_text = exercise_to_string(ex)  # some function that formats ex into text
-
-    # 2) Format the fix prompt
-    prompt_value = await template_fix.aformat_prompt(
-        exercise_text=ex_text,
-        diagnosis=diag_str
-    )
-    messages = prompt_value.to_messages()
-
-    # 3) Invoke the LLM
-    fix_resp = await llm_fix.ainvoke(messages)
-    raw_content = getattr(fix_resp, "content", fix_resp)
-
-    # 4) We can parse the LLM result if we want a structured object
-    # For example, if we told the LLM to return JSON that matches the Exercise schema:
-    # ex_fixed_data = parse the JSON
-    # ex_fixed = Exercise.model_validate(ex_fixed_data)
-    #
-    # Or if the LLM just returned plain text, you can do a simpler approach:
-    # For now, as a placeholder, let's just say we re-build the prompt field:
-
-    # If you do structured output, do something like:
-    # try:
-    #     ex_dict = json.loads(raw_content)
-    #     ex_fixed = Exercise.model_validate(ex_dict)
-    # except (JSONDecodeError, ValidationError) as e:
-    #     # fallback if needed
-    #     ex_fixed = ex.copy(update={"prompt": ex.prompt + " (fallback fix)"})
-
-    # For the sake of example, let's do a naive approach:
-    ex_fixed = ex.copy(update={"prompt": raw_content})
-
-    return ex_fixed
-
-
+async def fix_exercise(ex: Exercise, diag_str: str, cfg: dict, llm_fix: Any = None) -> Exercise:
+    tmpl_fix = cfg["template_fix_exercise"]
+    if not llm_fix:
+        llm_fix = cfg["llm_fix_exercise"]
+    llm_cast = cfg["llm_structurize"]  # already in chain_configs
+
+    # 1️⃣ first call: creative rewrite
+    prompt = await tmpl_fix.aformat_prompt(
+        exercise_text=exercise_to_string(ex),
+        diagnosis=diag_str
+    )
+    raw = (await llm_fix.ainvoke(prompt.to_messages())).content
+
+    # 2️⃣ second call: cast to schema
+    try:
+        ex_fixed = await llm_cast.with_structured_output(Exercise).ainvoke(
+            [("user", raw)]  # minimal prompt: just the text
+        )
+        return ex_fixed
+    except Exception:
+        return ex.copy(update={"prompt": raw})
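The rewritten fix_exercise replaces the single-call rewrite with a two-call pattern: a creative free-form rewrite, then a second call that casts the raw text onto the Exercise schema via structured output, with a naive prompt overwrite as fallback. A self-contained sketch of that pattern, assuming LangChain chat models and a stand-in Exercise schema (the model names here are illustrative, not the repo's config):

from pydantic import BaseModel
from langchain_openai import ChatOpenAI

class Exercise(BaseModel):
    prompt: str
    choices: list[str]
    correct_answer_id: int
    explanation: str | None = None

async def rewrite_then_cast(raw_exercise: str, diagnosis: str) -> Exercise | str:
    creative_llm = ChatOpenAI(model="gpt-4.1", temperature=0.7)   # free-form rewrite
    cast_llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.0)  # deterministic cast

    # Call 1: rewrite the exercise as plain text, guided by the diagnosis.
    rewrite = await creative_llm.ainvoke(
        f"Original exercise:\n{raw_exercise}\n\nDiagnosis:\n{diagnosis}\n\n"
        "Rewrite the exercise so that all issues are resolved."
    )

    # Call 2: cast the free-form text into the schema; fall back to raw text.
    try:
        return await cast_llm.with_structured_output(Exercise).ainvoke(
            [("user", rewrite.content)]
        )
    except Exception:
        return rewrite.content

Splitting the creative and casting steps lets a cheap deterministic model handle the schema work while the stronger model focuses on content.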
 
config/llm_config.py CHANGED
@@ -55,6 +55,8 @@ llms = {
     "GPT-4.1-mini (zero temp)": create_openai_llm("gpt-4.1-mini", ZERO),
     "GPT-4.1-mini (low temp)": create_openai_llm("gpt-4.1-mini", LOW),
     "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
+    "GPT-4.1 (low temp)": create_openai_llm("gpt-4.1", LOW),
+    "GPT-4.1 (mid temp)": create_openai_llm("gpt-4.1", MID),
     "GPT-4.5 (low temp)": create_openai_llm("gpt-4.5-preview-2025-02-27", LOW),
     "GPT-4.5 (mid temp)": create_openai_llm("gpt-4.5-preview-2025-02-27", MID),

@@ -64,6 +66,7 @@ llms = {
     "o3-mini (low reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
     "o3-mini (medium reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
     "o3-mini (high reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
+    "o3 (high reasoning_effort)": create_openai_reasoning_llm("o3", reasoning_effort="high"),

     # Anthropic models (Claude)
     "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
@@ -75,13 +78,14 @@ llms = {
     "Claude 3.7": create_anthropic_reasoning_llm("claude-3-7-sonnet-latest"),

     # DeepSeek
-    "Deepseek R1 (zero temp)": create_deepseek_llm("deepseek-reasoner", ZERO),
-    "Deepseek R1 (low temp)": create_deepseek_llm("deepseek-reasoner", LOW),
-    "Deepseek R1 (mid temp)": create_deepseek_llm("deepseek-reasoner", MID),
-    "Deepseek R1 (high temp)": create_deepseek_llm("deepseek-reasoner", HIGH),
+    "DeepSeek-R1 (zero temp)": create_deepseek_llm("deepseek-reasoner", ZERO),
+    "DeepSeek-R1 (low temp)": create_deepseek_llm("deepseek-reasoner", LOW),
+    "DeepSeek-R1 (mid temp)": create_deepseek_llm("deepseek-reasoner", MID),
+    "DeepSeek-R1 (high temp)": create_deepseek_llm("deepseek-reasoner", HIGH),
+    "DeepSeek-V3 (low temp)": create_deepseek_llm("deepseek-chat", LOW),

     # Google models (Gemini)
-    "Gemini 2.5 Pro Experimental (zero temp)": create_google_reasoning_llm(model_name="gemini-2.5-pro-exp-03-25"),
+    "Gemini 2.5 Pro Experimental (zero temp)": create_google_reasoning_llm(model_name="gemini-2.5-pro"),
 }

 # specific for Diagnosis tab
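Because the UI dropdown labels double as the dictionary keys, a renamed entry (e.g. "Deepseek R1 ..." to "DeepSeek-R1 ...") would break direct indexing for any stale label; the llms.get(choice, fallback) pattern used in the runner degrades to the configured default instead. A minimal illustration with stand-in values (the strings below are placeholders for the real LLM objects):

llms = {
    "GPT-4.1 (low temp)": "openai:gpt-4.1",
    "DeepSeek-R1 (low temp)": "deepseek:deepseek-reasoner",
}
default_llm = llms["GPT-4.1 (low temp)"]

# A stale pre-rename label misses the dict; fall back gracefully.
chosen = llms.get("Deepseek R1 (low temp)", default_llm)
assert chosen == "openai:gpt-4.1"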
config/system_prompt_texts.py CHANGED
@@ -496,7 +496,7 @@ template_sanitize_learning_objectives_text = """
 template_write_fluster_a_text= """
 # Task outline

-Given a learning objective, your goal is to write an exercise set of 3 high-quality multiple choice exercises that all test the exact same key fact that's stated in the learning objective.
+Given a learning objective, your goal is to write an exercise set of 3 high-quality multiple choice exercises that all test the exact same key fact that's stated in the learning objective. All exercises must be written in the same language as the learning objective.


 # Concepts
@@ -507,8 +507,8 @@ A learning objective states a specific fact. For example: "De student weet dat d
 ## Exercise
 An exercise tests the fact that is stated in the learning objective. It consists of:
 1. A prompt, posing to the student:
+- (Optional) Theory, additional information to clarify the question or statement
 - A question or statement
-- (Optional) Theory, additional information to clarify the question or statement
 2. Choices, which are the multiple answer options that are presented to the student as potential answers to the prompt.
 3. Correct answer, which indicates which of the choices is the correct answer to the prompt.
 4. (Optional) Explanation, explaining or expanding on the answer to the student to facilitate increased learning.
@@ -518,13 +518,13 @@ The student is always first presented with 1 and 2 (prompt and choices), and the
 An exercise set comprises 3 exercises that all test the same single learning objective in three different ways: one bigger multiple choice exercise and two smaller true/false statements.

 ## Distractors
-Distractors are the alternative answer option choices of the exercises that are not the correct answer. The false statement can also be considered a distractor (tempting the student to thing it is correct). Distractors are in fact the most important part of the exercises, because they often either make or break it. This is because distractors are difficult to get right, because in order to be effective they need to strike a precarious balance between "plausible-sounding" and yet "not too close to the truth", both at the same time. More on that in the requirements section.
+Distractors are the alternative answer option choices of the exercises that are not the correct answer. The false statement can also be considered a distractor (tempting the student to think it is correct). Distractors are in fact the most important part of the exercises, because they often either make or break it. This is because distractors are difficult to get right, because in order to be effective they need to strike a precarious balance between "plausible-sounding" and yet "not too close to the truth", both at the same time. More on that in the requirements section.

 ## Theory (optional)
-Theory is sometimes shown before answering the exercise, as an optional part of the prompt to clarify the question.
+Theory is sometimes shown before answering the exercise, as an optional part of the prompt to clarify the question or statement.

 ## Explanation (optional)
-An explanation should sometimes be presented to the student after they've answered the exercise, as an optional part of the correct answer reveal to better facilitate learning.
+An explanation should sometimes be presented to the student after they've answered the exercise, as an optional part of the correct answer reveal to better facilitate learning. For the false-statement exercise, explanations are mandatory (because they should provide the correct answer to the student).


 # Examples
@@ -641,7 +641,7 @@ The information that's posed in the prompt part of the exercise should only cont


 ## Theory & Explanation (optional)
-Theory or Explanation should only be added to all 3 exercises if there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
+Theory or Explanation should only be added if they're relevant to all 3 exercises and there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
 ### Theory (optional)
 Put any info here that is useful for the student to know before answering the question, as context to clarify the question or statement. The student is prompted with this together with the posing of the rest of the exercise.
 ### Explanation (optional)
@@ -660,7 +660,7 @@ The ideal distractor falls in the middle of this spectrum - plausible enough to
 Try to exactly match the terminology and language difficulty level from the learning objective. If it's stated in simple words, use equally simple words in the exercises as well.

 ## Output format
-Output format doesn't matter. Only prioritize thorough reasoning to arrive at high-quality exercises that satisfy all of the above requirements.
+Output format doesn't matter; parsing your response into structured exercises will be handled later in a separate step. For now, prioritize thorough reasoning to arrive at the highest-quality exercises that optimally satisfy all of the above requirements. Feel free to brainstorm and even back-track if you notice a mistake while generating a response.

 # Approach

config/templates.py CHANGED
@@ -256,11 +256,11 @@ template_fix_exercise = ChatPromptTemplate(
     (
         "system",
         "You are a helpful assistant that fixes issues in a single multiple choice exercise "
-        "based on diagnosis notes. Return only valid text with the same keys as the original."
+        "based on diagnosis notes. Return an improved exercise that has the same number of answer options as the original, and the same correct answer. For example, if the correct answer is 'Deze stelling is niet correct', then this must remain the correct answer."
     ),
     (
         "user",
-        "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
+        "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"  # the scorecard summary; ideally this would be the complete diagnosis of all issues
         "Rewrite the exercise so that all issues in the diagnosis are resolved. "
         "Use the same structure (prompt, choice_id_1..4, correct_answer_id, explanation)."
     ),
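For reference, the two placeholders in the user message ({exercise_text} and {diagnosis}) are filled at call time, as fix_exercise does via aformat_prompt. A minimal usage sketch with an abbreviated version of the template; the sample exercise and diagnosis strings are made up:

import asyncio
from langchain_core.prompts import ChatPromptTemplate

template_fix_exercise = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that fixes issues in a single "
               "multiple choice exercise based on diagnosis notes."),
    ("user", "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
             "Rewrite the exercise so that all issues in the diagnosis are resolved."),
])

async def main() -> None:
    # Fill the placeholders, then inspect the messages that would be sent.
    prompt_value = await template_fix_exercise.aformat_prompt(
        exercise_text="Q: 2 + 2 = ?\n1) 3\n2) 4\nCorrect: 2",
        diagnosis="❌ Distractor 1 is too implausible.",
    )
    for message in prompt_value.to_messages():
        print(message.type, "->", message.content[:60])

asyncio.run(main())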