BtB-ExpC committed
Commit c0ffcf0 · 1 Parent(s): 6e757e3

biiig change, built run_fluster_with_diagnosis

app/helpers/exercise_standardizer.py CHANGED
@@ -1,6 +1,6 @@
  # app/helpers/exercise_standardizer.py
  from langchain_core.prompts import ChatPromptTemplate
- from typing import Any
+ from typing import Any, Literal, List, Union
 
  from pydantic import BaseModel
 
@@ -31,37 +31,67 @@ async def standardize_exercise(user_query: str, exercise_format: str, template:
 
      return standardized_exercise
 
- # class ExerciseComplete(BaseModel):
- #     id: int
- #     content: str
- #     choice_id_1: str
- #     choice_id_2: str
- #     choice_id_3: str
- #     choice_id_4: str
- #     correct_answer_id: Literal[1, 2, 3, 4]
+ class Exercise(BaseModel):
+     id: int
+     prompt: str
+     choice_id_1: str
+     choice_id_2: str
+     choice_id_3: Union[str, None]
+     choice_id_4: Union[str, None]
+     correct_answer_id: Literal[1, 2, 3, 4]
+     explanation: Union[str, None]
 
+ class ExerciseSet(BaseModel):
+     id: int
+     exercises: List[Exercise]
 
- async def structurize_exercise(user_query: str, exercise_format: str, template: ChatPromptTemplate, llm: Any):
-     """
-     Standardizes an exercise's format using the specified template and LLM
-     """
-     if exercise_format == "Raw (original)":
-         return user_query  # No transformation needed
-
-     formatting_instructions = FORMAT_MAPPINGS_EXERCISES.get(
-         exercise_format,
-         "Please reformat the given exercise to ease further processing."
-     )
-
-     prompt_std = await template.aformat_prompt(
-         user_input=user_query,
-         formatting_instructions=formatting_instructions
-     )
-
-     std_messages = prompt_std.to_messages()
-     response = await llm.ainvoke(std_messages)
-     standardized_exercise = getattr(response, "content", response)
-
-     return standardized_exercise
+ async def structurize_exercise(
+     fluster_text: str,
+     template: ChatPromptTemplate,
+     llm: Any  # e.g. ChatOpenAI
+ ) -> ExerciseSet:
+     """
+     Distills individual exercises and their components from the fluster text
+     using a structured-output call that returns an ExerciseSet pydantic object.
+     """
+     # 1) Format the prompt
+     prompt_str = await template.aformat_prompt(fluster=fluster_text)
+     messages = prompt_str.to_messages()
+
+     # 2) Call the LLM with the schema; with_structured_output(...).ainvoke(...)
+     #    returns the parsed ExerciseSet directly (or None on refusal)
+     response = await llm.with_structured_output(ExerciseSet).ainvoke(messages)
+     exercise_set = response
+
+     # If the model refused or the schema was violated, you might get None or an error
+     if exercise_set is None:
+         raise ValueError(f"LLM refusal or invalid structured data.\nLLM response: {response}")
+
+     return exercise_set
+
+
+ def exercise_to_string(ex):
+     choices = [ex.choice_id_1, ex.choice_id_2, ex.choice_id_3, ex.choice_id_4]
+     choice_texts = [f" {idx + 1}) {choice}" for idx, choice in enumerate(choices) if choice]
+
+     # correct_answer_id is a 1-based index, so compare it against idx + 1
+     correct_choice_text = next(
+         (f" Correct answer: {idx + 1}) {choice}"
+          for idx, choice in enumerate(choices) if idx + 1 == ex.correct_answer_id),
+         " Correct answer: Unknown"
+     )
+
+     explanation_text = f" Explanation: {ex.explanation}" if ex.explanation else ""
+
+     plaintext_exercise = (
+         f"Exercise {ex.id}:\n"
+         f" {ex.prompt}\n"
+         + "\n".join(choice_texts) + "\n"
+         + correct_choice_text + "\n"
+         + explanation_text + "\n\n"
+     )
+
+     return plaintext_exercise
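
Note: a minimal usage sketch (not part of the commit) of the new `structurize_exercise` + `exercise_to_string` pair, wired to the `template_structurize` / `llm_structurize` entries this commit adds to `config/chain_configs.py` below. It assumes a LangChain chat model whose `with_structured_output(...).ainvoke(...)` returns the parsed `ExerciseSet` (or `None` on refusal).

```python
import asyncio

from app.helpers.exercise_standardizer import structurize_exercise, exercise_to_string
from config.chain_configs import chain_configs

async def demo(fluster_text: str) -> None:
    cfg = chain_configs["fluster"]
    # Parse the raw fluster text into an ExerciseSet via the structurize prompt + LLM
    exercise_set = await structurize_exercise(
        fluster_text,
        cfg["template_structurize"],
        cfg["llm_structurize"],
    )
    # Round-trip each parsed Exercise back to plaintext for display
    for ex in exercise_set.exercises:
        print(exercise_to_string(ex))

# asyncio.run(demo("Vraag: ...\n1) ...\n2) ...\nCorrect: 1"))
```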
 
app/ui/write_fluster_tab.py CHANGED
@@ -27,27 +27,52 @@ def build_write_fluster_tab():
          label="LLM 2"
      )
 
+     include_diagnosis = gr.Checkbox(
+         label="Immediately diagnose & fix",
+         value=False,
+         info="Diagnose each exercise and fix if issues found?"
+     )
+
      exercises_input = gr.Textbox(label="Enter a learning objective", value="De student weet dat")
      write_fluster_button = gr.Button("Generate Fluster")
 
-     # 2×2 textboxes => 4 total
-     # For clarity:
-     #   row 1 => (box_0, box_1)
-     #   row 2 => (box_2, box_3)
-     with gr.Row():
-         box_0 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
-         box_1 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
-     with gr.Row():
-         box_2 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
-         box_3 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+     # Results section
+     with gr.Column():
+         # Original fluster results (2×2 grid)
+         gr.Markdown("### Generated Fluster")
+         with gr.Row():
+             box_0 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
+             box_2 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+         with gr.Row():
+             box_1 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
+             box_3 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+
+         # -- 2 side-by-side textboxes for diagnosis results (Track1 & Track3)
+         with gr.Row():
+             diagnosis_box_1 = gr.Textbox(label="Diagnoses: Track1 (3 exercises)", interactive=False,
+                                          visible=True, lines=3)
+             diagnosis_box_3 = gr.Textbox(label="Diagnoses: Track3 (3 exercises)", interactive=False,
+                                          visible=True, lines=3)
+
+         # -- 2 side-by-side textboxes for final fixed flusters (Track1 & Track3)
+         with gr.Row():
+             fixes_box_1 = gr.Textbox(label="Final Fixed Track1", interactive=False, visible=True, lines=14)
+             fixes_box_3 = gr.Textbox(label="Final Fixed Track3", interactive=False, visible=True, lines=14)
 
-     # Return references if needed
-     return (model_choice_1,
-             model_choice_2,
-             exercises_input,
-             write_fluster_button,
-             [box_0, box_1, box_2, box_3],
+     # Return all necessary references
+     return (
+         model_choice_1,
+         model_choice_2,
+         include_diagnosis,
+         exercises_input,
+         write_fluster_button,
+         [box_0, box_1, box_2, box_3],
+         diagnosis_box_1,
+         diagnosis_box_3,
+         fixes_box_1,
+         fixes_box_3
      )
chains/diagnoser/diagnoser_chain.py CHANGED
@@ -12,13 +12,12 @@ class DiagnoserChain(BaseModel):
      llm_4o_mini: Any
      llm_4o: Any
 
-     async def diagnose_only(self, standardized_exercise: str) -> str:
+     async def diagnose_only(self, standardized_exercise: str) -> tuple[str, str]:
          """
          Takes a PRE-standardized exercise and:
-         (1) Runs multiple diagnosis prompts in parallel,
-         (2) Merges the results,
-         (3) Generates a scorecard line,
-         (4) Returns the combined text + scorecard.
+         Runs multiple diagnosis prompts, merges results, calls the scorecard prompt.
+         Returns a tuple: (combined_diagnosis, scorecard).
+         The first item is the merged text from each prompt; the second item is the final single-line scorecard.
          """
 
          # Step 1: define an async helper to run each diagnosis in parallel
@@ -47,7 +46,7 @@ class DiagnoserChain(BaseModel):
          scorecard_response = await self.llm_4o.ainvoke(scorecard_messages)
          scorecard = getattr(scorecard_response, "content", scorecard_response)
 
-         return combined_diagnosis + "\n--- [SCORECARD] ---\n" + scorecard
+         return combined_diagnosis, scorecard
 
      class Config:
          arbitrary_types_allowed = True
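
With `diagnose_only` now returning a pair instead of one concatenated string, callers unpack it and apply the `❌` marker check themselves (as `diagnose_and_fix_all` in the new `run_fluster_with_diagnosis.py` does). A minimal sketch, assuming an instantiated `DiagnoserChain` named `diag_chain`:

```python
combined_diagnosis, scorecard = await diag_chain.diagnose_only(standardized_exercise)
needs_fix = "❌" in scorecard  # the marker check used downstream to trigger fix_exercise
```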
chains/diagnoser/runner.py CHANGED
@@ -56,9 +56,13 @@ async def run_diagnoser(user_query: str, model_choice_diagnose: str, exercise_fo
      ]
      # run concurrently
      responses = await asyncio.gather(*tasks)
+     formatted_responses = [
+         f"{combined_diagnosis}\n--- [SCORECARD] ---\n{scorecard}"
+         for combined_diagnosis, scorecard in responses
+     ]
 
      # pad up to 10 if needed
-     all_responses = list(responses) + [""] * (10 - len(responses))
+     all_responses = formatted_responses + [""] * (10 - len(formatted_responses))
 
      # Return a tuple of exactly 10 responses, + the standardized exercise.
      return tuple(all_responses) + (standardized_exercise,)
chains/distractors/runner.py CHANGED
@@ -1,4 +1,4 @@
- # chains/distractors/runner.py
+ # chains/distractors/runner_without.py
  import asyncio
 
  from config.chain_configs import chain_configs
chains/exercises/run_fluster_with_diagnosis.py ADDED
@@ -0,0 +1,282 @@
+ # chains/exercises/run_fluster_with_diagnosis.py
+ import asyncio
+ from typing import List, Tuple
+
+ from pydantic import ValidationError
+
+ from app.helpers.exercise_standardizer import structurize_exercise, ExerciseSet, Exercise, exercise_to_string
+ from chains.exercises.runner_without import write_fluster_track
+ from config.chain_configs import chain_configs
+ from config.llm_config import llms
+
+
+ async def _async_fluster_with_diagnosis(
+     user_input_text: str,
+     model_choice_1: str,
+     model_choice_2: str
+ ) -> Tuple[str, str, str, str, str, str, str, str]:
+     """
+     The core async pipeline:
+       1. Generate fluster text for track0 & track2 (in parallel).
+       2. Parse each text => get a list of Exercise objects.
+       3. Diagnose each exercise => fix if needed.
+       4. Build the final output strings for the UI.
+     """
+     fluster_config = chain_configs["fluster"]
+     diagnoser_config = chain_configs["diagnoser"]
+
+     # 1) Generate track0 & track2 in parallel
+     track0_coro = write_fluster_track(
+         0,
+         user_input_text,
+         fluster_config["template_write_fluster_a"],
+         fluster_config["template_write_fluster_b"],
+         llms.get(model_choice_1, fluster_config["default_llm_a"]),
+         llms.get(model_choice_2, fluster_config["default_llm_b"]),
+         fluster_config["template_sanitize"],
+         fluster_config["llm_sanitize"]
+     )
+     track2_coro = write_fluster_track(
+         2,
+         user_input_text,
+         fluster_config["template_write_fluster_a"],
+         fluster_config["template_write_fluster_b"],
+         llms.get(model_choice_1, fluster_config["default_llm_a"]),
+         llms.get(model_choice_2, fluster_config["default_llm_b"]),
+         fluster_config["template_sanitize"],
+         fluster_config["llm_sanitize"]
+     )
+
+     (t0_idx, track0_text), (t2_idx, track2_text) = await asyncio.gather(track0_coro, track2_coro)
+
+     # 2) Parse each final text => list of Exercises (via the structurize prompt + LLM)
+     fluster0_set = await structurize_exercise(
+         track0_text, fluster_config["template_structurize"], fluster_config["llm_structurize"]
+     )
+     fluster2_set = await structurize_exercise(
+         track2_text, fluster_config["template_structurize"], fluster_config["llm_structurize"]
+     )
+     fluster0_exs = fluster0_set.exercises
+     fluster2_exs = fluster2_set.exercises
+
+     # 3) Diagnose + fix each exercise
+     diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config, fluster_config)
+     diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config, fluster_config)
+
+     # 4) Convert the final exercises to strings for display
+     #    (Or you can store them back into a bigger data structure.)
+     final0_text = build_fluster_text(fixed0_exs)
+     final2_text = build_fluster_text(fixed2_exs)
+
+     # We'll combine the diagnoses into single strings
+     diagnosis_text_0 = "\n".join(diag0_results)
+     diagnosis_text_2 = "\n".join(diag2_results)
+
+     # 5) Return the 8 items in the order your UI needs
+     #    (track0_text, "", track2_text, "", diag0_text, diag2_text, fixed0_text, fixed2_text)
+     return (
+         track0_text,        # box_0
+         "",                 # box_1 (unused)
+         track2_text,        # box_2
+         "",                 # box_3 (unused)
+         diagnosis_text_0,   # diagnosis_box_1
+         diagnosis_text_2,   # diagnosis_box_3
+         final0_text,        # fixes_box_1
+         final2_text         # fixes_box_3
+     )
+
+
+ def run_fluster_with_diagnosis(
+     user_input_text: str,
+     model_choice_1: str,
+     model_choice_2: str
+ ) -> Tuple[str, str, str, str, str, str, str, str]:
+     """
+     Synchronous entrypoint for the UI or external calls.
+     """
+     return asyncio.run(_async_fluster_with_diagnosis(user_input_text, model_choice_1, model_choice_2))
+
+
+ async def diagnose_and_fix_all(
+     exercises: List[Exercise],
+     diagnoser_config: dict,
+     fluster_config: dict
+ ) -> tuple[List[str], List[Exercise]]:
+     """
+     For each exercise, run the 'diagnose_only' from the DiagnoserChain,
+     then interpret the results (scorecard) to see if we need a fix,
+     then produce an updated exercise if needed.
+
+     Returns:
+       - a list of strings (one per exercise) summarizing the diagnosis,
+       - a list of possibly fixed exercises.
+     """
+     diag_chain = diagnoser_config["class"](
+         templates_diagnose=diagnoser_config["templates_diagnose"],
+         template_diagnose_scorecard=diagnoser_config["template_diagnose_scorecard"],
+         llm_diagnose=diagnoser_config["llm_diagnose"],
+         llm_4o_mini=diagnoser_config["llm_4o_mini"],
+         llm_4o=diagnoser_config["llm_4o"]
+     )
+
+     diag_strings = []
+     fixed_exs = []
+
+     # Could do parallel calls, but let's keep it simple here
+     for ex in exercises:
+         # 1) Build a standardized string from the exercise
+         ex_str = exercise_to_string(ex)
+         # 2) call diagnose_only => returns (combined text, scorecard)
+         combined_diag, scorecard = await diag_chain.diagnose_only(ex_str)
+         # 3) interpret the result
+         diag_result = (
+             f"Exercise {ex.id}:\n{combined_diag}\n--- [SCORECARD] ---\n{scorecard}"
+         )
+         diag_strings.append(diag_result)
+
+         if "❌" in scorecard:
+             ex_fixed = await fix_exercise(ex, scorecard, fluster_config)
+             fixed_exs.append(ex_fixed)
+         else:
+             fixed_exs.append(ex)
+
+     return diag_strings, fixed_exs
+
+
+ async def diagnose_exercise(ex: Exercise) -> tuple[str, str]:
+     """
+     Convert an Exercise object to a standardized string that DiagnoserChain can handle,
+     then call DiagnoserChain.diagnose_only(...).
+     """
+     # 1) standardize or build a string from the exercise
+     #    e.g. "Vraag: ...\nA) ...\nB) ...\nCorrect=1"
+     standardized_str = exercise_to_string(ex)
+
+     # 2) get the chain config for "diagnoser"
+     diag_config = chain_configs["diagnoser"]
+
+     # 3) instantiate the chain object (if needed) or reuse a global one
+     chain_instance = diag_config["class"](
+         templates_diagnose=diag_config["templates_diagnose"],
+         template_diagnose_scorecard=diag_config["template_diagnose_scorecard"],
+         llm_diagnose=diag_config["llm_diagnose"],
+         llm_4o_mini=diag_config["llm_4o_mini"],
+         llm_4o=diag_config["llm_4o"]
+     )
+
+     # 4) call diagnose_only
+     diagnosis = await chain_instance.diagnose_only(standardized_str)
+     return diagnosis
+
+
+ async def fix_exercise(
+     ex: Exercise,
+     diag_str: str,
+     fluster_config: dict
+ ) -> Exercise:
+     """
+     Calls 'template_fix_exercise' + 'llm_fix_exercise' from the fluster config
+     to rewrite the exercise so it addresses the diagnosis issues.
+     """
+     template_fix = fluster_config["template_fix_exercise"]
+     llm_fix = fluster_config["llm_fix_exercise"]
+
+     # 1) Convert the exercise to text
+     ex_text = exercise_to_string(ex)
+
+     # 2) Format the fix prompt
+     prompt_value = await template_fix.aformat_prompt(
+         exercise_text=ex_text,
+         diagnosis=diag_str
+     )
+     messages = prompt_value.to_messages()
+
+     # 3) Invoke the LLM
+     fix_resp = await llm_fix.ainvoke(messages)
+     raw_content = getattr(fix_resp, "content", fix_resp)
+
+     # 4) We can parse the LLM result if we want a structured object.
+     #    For example, if we told the LLM to return JSON that matches the Exercise schema:
+     #        ex_dict = json.loads(raw_content)
+     #        ex_fixed = Exercise.model_validate(ex_dict)
+     #    wrapped in a try/except (JSONDecodeError, ValidationError) with a fallback such as:
+     #        ex_fixed = ex.copy(update={"prompt": ex.prompt + " (fallback fix)"})
+
+     # For the sake of example, let's do a naive approach:
+     ex_fixed = ex.copy(update={"prompt": raw_content})
+
+     return ex_fixed
+
+
+ def build_fluster_text(ex_list: list[Exercise]) -> str:
+     """
+     Combine the final fixed exercises into a user-facing block of text.
+     """
+     lines = []
+     for ex in ex_list:
+         lines.append(
+             f"Exercise {ex.id}:\n"
+             f" {ex.prompt}\n"
+             f" 1) {ex.choice_id_1}\n"
+             f" 2) {ex.choice_id_2}\n"
+             f" 3) {ex.choice_id_3}\n"
+             f" 4) {ex.choice_id_4}\n"
+             f" Correct answer: {ex.correct_answer_id}\n"
+             f" Explanation: {ex.explanation}\n\n"
+         )
+     return "\n".join(lines)
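
A sketch (illustrative, not committed code) of the synchronous entrypoint's contract, i.e. the 8-tuple it hands to the Gradio outputs. The model-choice keys are taken from `config/llm_config.py`; note that `asyncio.run` assumes no event loop is already running in the calling thread.

```python
(
    track0_text, _unused_1, track2_text, _unused_3,
    diagnosis_text_0, diagnosis_text_2,
    final0_text, final2_text,
) = run_fluster_with_diagnosis(
    "De student weet dat ...",   # learning objective
    "GPT-4o (zero temp)",        # model_choice_1 -> LLM A
    "GPT-4o-mini (zero temp)",   # model_choice_2 -> LLM B
)
```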
chains/exercises/runner_with.py ADDED
File without changes
chains/exercises/{runner.py → runner_without.py} RENAMED
@@ -1,4 +1,4 @@
- # chains/exercises/runner.py
+ # chains/exercises/runner_without.py
  import asyncio
  from typing import AsyncGenerator
  from config.llm_config import llms
@@ -6,7 +6,66 @@ from config.chain_configs import chain_configs
  from config.templates import template_sanitize_fluster
 
 
- async def run_fluster(
+ # chains/exercises/runner_utils.py (for example)
+
+ import asyncio
+ from typing import Tuple, Any
+ from langchain_core.prompts.chat import ChatPromptTemplate
+
+ async def write_fluster_track(
+     track_index: int,
+     user_input_text: str,
+     template_write_a: ChatPromptTemplate,
+     template_write_b: ChatPromptTemplate,
+     llm_a: Any,
+     llm_b: Any,
+     # If you later enable the "refine" step, pass those too:
+     # template_refine: ChatPromptTemplate,
+     # llm_refine: Any,
+     template_sanitize: ChatPromptTemplate,
+     llm_sanitize: Any
+ ) -> Tuple[int, str]:
+     """
+     A reusable helper that:
+       (1) Picks prompt A or B,
+       (2) Picks LLM A or B,
+       (3) Generates a fluster,
+       (4) Optionally refines distractors,
+       (5) Sanitizes,
+       (6) Returns (track_index, final_text).
+     """
+
+     # Decide which prompt to use
+     if track_index in (0, 2):
+         gen_template = template_write_a
+     else:
+         gen_template = template_write_b
+
+     # Decide which LLM to use
+     if track_index in (0, 1):
+         gen_llm = llm_a
+     else:
+         gen_llm = llm_b
+
+     # 1) Generate
+     gen_msg = await gen_template.aformat_prompt(learning_objective=user_input_text)
+     gen_resp = await gen_llm.ainvoke(gen_msg.to_messages())
+     write_fluster_result = getattr(gen_resp, "content", gen_resp)
+
+     # 2) Refine distractors (currently skipped)
+     # refine_msg = await template_refine.aformat_prompt(write_fluster_result=write_fluster_result)
+     # refine_resp = await llm_refine.ainvoke(refine_msg.to_messages())
+     # refined_output = getattr(refine_resp, "content", refine_resp)
+
+     # 3) Sanitize
+     sanitize_msg = await template_sanitize.aformat_prompt(refinement_result=write_fluster_result)
+     sanitize_resp = await llm_sanitize.ainvoke(sanitize_msg.to_messages())
+     sanitized_output = getattr(sanitize_resp, "content", sanitize_resp)
+
+     return (track_index, sanitized_output)
+
+
+ async def run_fluster_no_diagnosis(
      user_input_text: str,
      model_choice_1: str,  # for "LLM A"
      model_choice_2: str   # for "LLM B"
@@ -39,53 +98,23 @@ async def run_fluster(
      # We'll hold the final results for each of the 4 tracks in a list
      partial_results = ["", "", "", ""]
 
-     # Helper function: runs the pipeline for a single track
-     async def run_track(track_index: int):
-         """
-         Steps for each track:
-           1) pick prompt A or B
-           2) pick LLM A or B
-           3) generate
-           4) refine
-           5) sanitize
-           6) return final text
-         """
-         # Decide which prompt to use
-         if track_index in (0, 2):
-             gen_template = template_write_a
-         else:
-             gen_template = template_write_b
-
-         # Decide which LLM to use
-         if track_index in (0, 1):
-             gen_llm = llm_a
-         else:
-             gen_llm = llm_b
-
-         # 1) Generate
-         gen_msg = await gen_template.aformat_prompt(learning_objective=user_input_text)
-         gen_resp = await gen_llm.ainvoke(gen_msg.to_messages())
-         write_fluster_result = getattr(gen_resp, "content", gen_resp)
-
-         # 2) Refine distractors << # we skip refinement for now
-         # refine_msg = await template_refine.aformat_prompt(write_fluster_result=write_fluster_result)
-         # refine_resp = await llm_refine.ainvoke(refine_msg.to_messages())
-         # refined_output = getattr(refine_resp, "content", refine_resp)
-
-         # 3) Sanitize
-         sanitize_msg = await template_sanitize.aformat_prompt(refinement_result=write_fluster_result)
-         sanitize_resp = await llm_sanitize.ainvoke(sanitize_msg.to_messages())
-         sanitized_output = getattr(sanitize_resp, "content", sanitize_resp)
-
-         return track_index, sanitized_output
-
-     # Prepare the 4 tasks
-     tasks = [
-         run_track(0),
-         run_track(1),
-         run_track(2),
-         run_track(3),
-     ]
+     ## We'll define tasks that each call `write_fluster_track(...)`
+     tasks = []
+     for track_i in range(4):
+         coro = write_fluster_track(
+             track_i,
+             user_input_text,
+             template_write_a,
+             template_write_b,
+             llm_a,
+             llm_b,
+             # template_refine,
+             # llm_refine,
+             template_sanitize,
+             llm_sanitize
+         )
+         tasks.append(coro)
 
      # Run them in parallel
      for coro in asyncio.as_completed(tasks):
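
The hunk ends at the `as_completed` loop, so its body isn't shown; given the `(track_index, text)` return type of `write_fluster_track` and the `AsyncGenerator` import, the consumption pattern is presumably along these lines (a sketch, not the committed code):

```python
for coro in asyncio.as_completed(tasks):
    track_index, final_text = await coro
    partial_results[track_index] = final_text
    # stream an intermediate 4-tuple so the UI textboxes update as tracks finish
    yield tuple(partial_results)
```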
config/chain_configs.py CHANGED
@@ -16,7 +16,7 @@ from config.templates import (
      template_write_fluster_a,
      template_write_fluster_b,
      template_refine_fluster,
-     template_sanitize_fluster,
+     template_sanitize_fluster, template_isolate_exercises, template_fix_exercise,
  )
  from chains.diagnoser.diagnoser_chain import DiagnoserChain
  from chains.distractors.distractors_chain import DistractorsChain
@@ -76,5 +76,9 @@ chain_configs = {
          "llm_refine": llms["GPT-4o (zero temp)"],
          "template_sanitize": template_sanitize_fluster,
          "llm_sanitize": llms["GPT-4o-mini (zero temp)"],
+         "template_structurize": template_isolate_exercises,
+         "llm_structurize": llms["GPT-4o (zero temp)"],
+         "template_fix_exercise": template_fix_exercise,
+         "llm_fix_exercise": llms["GPT-4o (low temp)"],
      },
  }
config/llm_config.py CHANGED
@@ -44,8 +44,8 @@ llms = {
      "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
      # OpenAI reasoning models (no temperature)
-     "o1 (low reasoning_effort)": create_openai_reasoning_llm("o1-2024-12-17", reasoning_effort="low"),
-     "o1 (high reasoning_effort)": create_openai_reasoning_llm("o1-2024-12-17", reasoning_effort="high"),
+     "o1 (low reasoning_effort)": create_openai_reasoning_llm("o1", reasoning_effort="low"),
+     "o1 (high reasoning_effort)": create_openai_reasoning_llm("o1", reasoning_effort="high"),
      "o3-mini (low reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
      "o3-mini (medium reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
      "o3-mini (high reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
config/system_prompt_texts.py CHANGED
@@ -634,10 +634,11 @@ An explanation should sometimes be presented to the student after they've answer
  # Requirements
 
  ## Exercise
- Each of the 3 exercises must test the very same key fact in the given learning objective (the info that's not in parentheses). Assume this described fact is self-evident, not in need of any further outside source or authority for substantiation. Any text between parentheses must only be used in the Theory or Explanation sections of the exercises.
+ Each of the 3 exercises must test the very same key fact in the given learning objective (the info that's not in parentheses). Any text between parentheses must only be used in the Theory or Explanation sections of the exercises.
 
  ## Prompt
- The information in the prompt should only contain information that's also present in the learning objective. For example, don't reference anything outside of it (for example, don't use "according to the study text" if the learning objective doesn't say this either).
+ The information that's posed in the prompt part of the exercise should only contain information that's also present in the learning objective. Do not reference any source outside of it. See the below examples, both for the learning objective:
+
 
  ## Theory & Explanation (optional)
  Theory or Explanation should only be added to all 3 exercises if there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
@@ -650,7 +651,7 @@ Put any info there that is not necessary to clarify the prompt beforehand (or th
  A good distractor makes a student pause and consider it, separating those who understand the material from those who do not. A bad distractor fails to do this; it can either:
  1. Confuse or trick even well-prepared students into believing it might be correct (“too close to the truth”)
  2. Be so obviously wrong that no one would reasonably choose it, not even the least knowledgeable student (“too obviously false”).
- To be effective, distractors must therefore look "very plausible to someone who doesn't know the topic" and yet remai n "clearly wrong to someone who knows the topic well", all at the same time.
+ To be effective, distractors must therefore look "very plausible to someone who doesn't know the topic" and yet remain "clearly wrong to someone who knows the topic well", all at the same time.
  Distractors are too close to the truth, when they are so similar to the correct answer that experts might debate whether they're also valid. They create unnecessary ambiguity and frustrate knowledgeable test-takers, for example by containing partial truths.
  Distractors are too obviously false, when they are clearly ridiculous or fantastical to even the dumbest student.
  The ideal distractor falls in the middle of this spectrum - plausible enough to tempt those with incomplete knowledge, but clearly incorrect to those who understand the material.
@@ -1034,6 +1035,26 @@ Roughly follow the following template:
  [exercise 3]
  """
 
+ template_isolate_exercises_text = """
+ Split up the given exercise set into its individual exercises, adhering to this schema:
+ class Exercise(BaseModel):
+     id: int
+     prompt: str
+     choice_id_1: str
+     choice_id_2: str
+     choice_id_3: Union[str, None]
+     choice_id_4: Union[str, None]
+     correct_answer_id: Literal[1, 2, 3, 4]
+     explanation: Union[str, None]
+
+
+ class ExerciseSet(BaseModel):
+     id: int
+     exercises: List[Exercise]
+
+ Set sequential ids starting at 1. Prompt is the posing of the question (including headers, like 'Vraag:' and Theory/Case if present). Third and fourth answer options (choices) are optional, as are the explanations, as they're not always present.
+ """
+
 
 
  XML_templates= [
config/templates.py CHANGED
@@ -18,7 +18,7 @@ from config.system_prompt_texts import (
      template_write_fluster_a_text,
      template_write_fluster_b_text,
      template_refine_fluster_text,
-     template_sanitize_fluster_text,
+     template_sanitize_fluster_text, template_isolate_exercises_text,
  )
 
 
@@ -242,8 +242,6 @@ template_refine_fluster = ChatPromptTemplate(
      input_variables=["write_fluster_result"]
  )
 
-
-
  template_sanitize_fluster = ChatPromptTemplate(
      messages=[
          ("system", template_sanitize_fluster_text),
@@ -252,4 +250,27 @@ template_sanitize_fluster = ChatPromptTemplate(
      input_variables=["refinement_result"]
  )
 
+ template_isolate_exercises = ChatPromptTemplate(
+     messages=[
+         ("system", template_isolate_exercises_text),
+         ("human", "{fluster}")
+     ],
+     input_variables=["fluster"]
+ )
 
+ template_fix_exercise = ChatPromptTemplate(
+     messages=[
+         (
+             "system",
+             "You are a helpful assistant that fixes issues in a single multiple choice exercise "
+             "based on diagnosis notes. Return only valid text with the same keys as the original."
+         ),
+         (
+             "user",
+             "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
+             "Rewrite the exercise so that all issues in the diagnosis are resolved. "
+             "Use the same structure (prompt, choice_id_1..4, correct_answer_id, explanation)."
+         ),
+     ],
+     input_variables=["exercise_text", "diagnosis"]
+ )
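
A quick local sanity check that the two new templates render with their declared input variables, with no LLM call involved (a sketch with made-up filler values):

```python
msgs = template_isolate_exercises.format_messages(
    fluster="Vraag: ...\n1) a\n2) b\nCorrect: 1"
)
msgs_fix = template_fix_exercise.format_messages(
    exercise_text="Exercise 1: ...",
    diagnosis="--- [SCORECARD] ---\n❌ distractor too close to the truth",
)
print(msgs[0].content[:80])
print(msgs_fix[-1].content[:80])
```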
main.py CHANGED
@@ -11,7 +11,8 @@ from app.ui.test_set_tab import build_test_set_tab
  from app.ui.write_fluster_tab import build_write_fluster_tab
  from chains.diagnoser.runner import run_diagnoser
  from chains.distractors.runner import run_distractors
- from chains.exercises.runner import run_fluster
+ from chains.exercises.run_fluster_with_diagnosis import run_fluster_with_diagnosis
+ from chains.exercises.runner_without import run_fluster_no_diagnosis
  from chains.learning_objectives_generator.runner import run_learning_objectives_generator
  from utils.auth import login as auth_login
 
@@ -95,8 +96,13 @@ with gr.Blocks() as interface:
      (model_choice_fluster_1,
       model_choice_fluster_2,
       exercises_input,
+      include_diagnosis,
       write_fluster_button,
       [fluster_box_0, fluster_box_1, fluster_box_2, fluster_box_3],
+      diagnosis_box_1,
+      diagnosis_box_3,
+      fixes_box_1,
+      fixes_box_3
      ) = build_write_fluster_tab()
 
      # 6 Empty separators (somehow scale=6 doesn't work)
@@ -163,12 +169,50 @@ with gr.Blocks() as interface:
          # or "stream=True" depending on your version of Gradio
      )
 
+     def fluster_pipeline_dispatch(
+         user_input: str,
+         model_1: str,
+         model_2: str,
+         include_diagnosis: bool
+     ):
+         """
+         Decide how to run the fluster generation.
+         If include_diagnosis=False, we do all 4 tracks, no diagnosing/fixing.
+         If include_diagnosis=True, we ONLY do tracks 1 & 3, then parse+diagnose+fix them.
+         We'll then return 8 values:
+           (track1, track2, track3, track4, diag1, diag3, fix1, fix3)
+         """
+         if not include_diagnosis:
+             # => run the original pipeline that yields 4 parallel flusters
+             #    and do NOT parse/diagnose/fix anything.
+             track0, track1, track2, track3 = run_fluster_no_diagnosis(user_input, model_1, model_2)
+             return (track0, track1, track2, track3, "", "", "", "")
+         else:
+             # => run only track0 & track2 (i.e. track 1 & track3 in the UI),
+             #    parse them for 3 exercises each, diagnose, fix
+             return run_fluster_with_diagnosis(user_input, model_1, model_2)
+
+
      write_fluster_button.click(
-         fn=run_fluster,  # async generator
-         inputs=[exercises_input, model_choice_fluster_1, model_choice_fluster_2],
-         outputs=[fluster_box_0, fluster_box_1, fluster_box_2, fluster_box_3],  # fill the 4 textboxes
-         api_name=None,
-         queue=True,
+         fn=fluster_pipeline_dispatch,
+         inputs=[
+             exercises_input,
+             model_choice_fluster_1,
+             model_choice_fluster_2,
+             include_diagnosis
+         ],
+         outputs=[
+             fluster_box_0,    # track1
+             fluster_box_1,    # track2
+             fluster_box_2,    # track3
+             fluster_box_3,    # track4
+             diagnosis_box_1,
+             diagnosis_box_3,
+             fixes_box_1,
+             fixes_box_3
+         ],
+         queue=True
      )
 
      pipeline_choice.change(fn=log_dropdown_choice, inputs=pipeline_choice, outputs=[])
pending_issues.md CHANGED
@@ -8,12 +8,30 @@ De student weet dat iemands leven wordt gevormd door drie dingen: interne factor
 
  # Fluster generation
  ## References to things outside of the LO
- Input:
- De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
+ ### Input: De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
 
- Result (1/2):
+ Result 75%:
 
  Vraag:
  Waaruit wordt iemands leven volgens de leerstof gevormd?
 
+ Vraag:
+ Volgens de leerdoel wordt ieders leven gevormd door drie dingen. Welke drie zijn dat?
+
+ ## Overuse of absolutes Alleen/Uitsluitend
+ ### Input: De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
+
+ Result (90%):
+ Ieders leven wordt uitsluitend bepaald door interne factoren.
+
+ Ieders leven wordt alleen gevormd door externe factoren en zelfbepaling.
+
+ ### Input: De student weet dat interne factoren van binnenuit komen (zoals genen en gezondheid) en mede bepalen wie je bent.:
+ Vraag: Wat wordt er bedoeld met "interne factoren" als we kijken naar wie je bent?
+ 1. Dat zijn factoren van binnenuit, zoals je genen en gezondheid, die mede bepalen wie je bent. ⬅️
+ 2. Dat zijn omstandigheden in je omgeving, zoals gezin en cultuur, die bepalen wie je bent.
+ 3. Dat zijn **alleen eigenschappen** die je tijdens je jeugd aanleert op school en in je omgeving.
+ 4. Dat zijn **uitsluitend dingen** die te maken hebben met je gedrag, zonder invloed van bijvoorbeeld je lichamelijke gesteldheid.
37