BtB-ExpC committed
Commit c0ffcf0 · 1 Parent(s): 6e757e3

biiig change, built run_fluster_with_diagnosis

app/helpers/exercise_standardizer.py CHANGED
@@ -1,6 +1,6 @@
  # app/helpers/exercise_standardizer.py
  from langchain_core.prompts import ChatPromptTemplate
- from typing import Any
+ from typing import Any, Literal, List, Union
 
  from pydantic import BaseModel
 
@@ -31,37 +31,67 @@ async def standardize_exercise(user_query: str, exercise_format: str, template:
 
      return standardized_exercise
 
- # class ExerciseComplete(BaseModel):
- #     id: int
- #     content: str
- #     choice_id_1: str
- #     choice_id_2: str
- #     choice_id_3: str
- #     choice_id_4: str
- #     correct_answer_id: Literal[1, 2, 3, 4]
+ class Exercise(BaseModel):
+     id: int
+     prompt: str
+     choice_id_1: str
+     choice_id_2: str
+     choice_id_3: Union[str, None]
+     choice_id_4: Union[str, None]
+     correct_answer_id: Literal[1, 2, 3, 4]
+     explanation: Union[str, None]
 
+ class ExerciseSet(BaseModel):
+     id: int
+     exercises: List[Exercise]
 
- async def structurize_exercise(user_query: str, exercise_format: str, template: ChatPromptTemplate, llm: Any):
-     """
-     Standardizes an exercise's format using the specified template and LLM
-     """
-     if exercise_format == "Raw (original)":
-         return user_query  # No transformation needed
-
-     formatting_instructions = FORMAT_MAPPINGS_EXERCISES.get(
-         exercise_format,
-         "Please reformat the given exercise to ease further processing."
-     )
-
-     prompt_std = await template.aformat_prompt(
-         user_input=user_query,
-         formatting_instructions=formatting_instructions
-     )
-
-     std_messages = prompt_std.to_messages()
-     response = await llm.ainvoke(std_messages)
-     standardized_exercise = getattr(response, "content", response)
-
-     return standardized_exercise
+ async def structurize_exercise(
+     fluster_text: str,
+     template: ChatPromptTemplate,
+     llm: Any  # e.g. ChatOpenAI
+ ) -> ExerciseSet:
+     """
+     Distills individual exercises and their components from the fluster text
+     using a structured-output call that returns an ExerciseSet pydantic object.
+     """
+     # 1) Format the prompt
+     prompt_str = await template.aformat_prompt(fluster=fluster_text)
+     messages = prompt_str.to_messages()
+
+     # 2) Call the LLM with the schema; with_structured_output(...).ainvoke(...)
+     #    returns the parsed ExerciseSet directly (or None on refusal)
+     response = await llm.with_structured_output(ExerciseSet).ainvoke(messages)
+     exercise_set = response
+
+     # If the model refused or the schema was violated, you might get None or an error
+     if exercise_set is None:
+         raise ValueError(f"LLM refusal or invalid structured data.\nLLM response: {response}")
+
+     return exercise_set
+
+
+ def exercise_to_string(ex):
+     choices = [ex.choice_id_1, ex.choice_id_2, ex.choice_id_3, ex.choice_id_4]
+     choice_texts = [f" {idx + 1}) {choice}" for idx, choice in enumerate(choices) if choice]
+
+     # correct_answer_id is a 1-based index, so compare it against idx + 1
+     correct_choice_text = next(
+         (f" Correct answer: {idx + 1}) {choice}"
+          for idx, choice in enumerate(choices) if idx + 1 == ex.correct_answer_id),
+         " Correct answer: Unknown"
+     )
+
+     explanation_text = f" Explanation: {ex.explanation}" if ex.explanation else ""
+
+     plaintext_exercise = (
+         f"Exercise {ex.id}:\n"
+         f" {ex.prompt}\n"
+         + "\n".join(choice_texts) + "\n"
+         + correct_choice_text + "\n"
+         + explanation_text + "\n\n"
+     )
+
+     return plaintext_exercise
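
Note: a minimal usage sketch (not part of the commit) of the new `structurize_exercise` + `exercise_to_string` pair, wired to the `template_structurize` / `llm_structurize` entries this commit adds to `config/chain_configs.py` below. It assumes a LangChain chat model whose `with_structured_output(...).ainvoke(...)` returns the parsed `ExerciseSet` (or `None` on refusal).

```python
import asyncio

from app.helpers.exercise_standardizer import structurize_exercise, exercise_to_string
from config.chain_configs import chain_configs

async def demo(fluster_text: str) -> None:
    cfg = chain_configs["fluster"]
    # Parse the raw fluster text into an ExerciseSet via the structurize prompt + LLM
    exercise_set = await structurize_exercise(
        fluster_text,
        cfg["template_structurize"],
        cfg["llm_structurize"],
    )
    # Round-trip each parsed Exercise back to plaintext for display
    for ex in exercise_set.exercises:
        print(exercise_to_string(ex))

# asyncio.run(demo("Vraag: ...\n1) ...\n2) ...\nCorrect: 1"))
```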
 
app/ui/write_fluster_tab.py CHANGED
@@ -27,27 +27,52 @@ def build_write_fluster_tab():
          label="LLM 2"
      )
 
+     include_diagnosis = gr.Checkbox(
+         label="Immediately diagnose & fix",
+         value=False,
+         info="Diagnose each exercise and fix if issues found?"
+     )
+
      exercises_input = gr.Textbox(label="Enter a learning objective", value="De student weet dat")
      write_fluster_button = gr.Button("Generate Fluster")
 
-     # 2×2 textboxes => 4 total
-     # For clarity:
-     #   row 1 => (box_0, box_1)
-     #   row 2 => (box_2, box_3)
-     with gr.Row():
-         box_0 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
-         box_1 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
-     with gr.Row():
-         box_2 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
-         box_3 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+     # Results section
+     with gr.Column():
+         # Original fluster results (2×2 grid)
+         gr.Markdown("### Generated Fluster")
+         with gr.Row():
+             box_0 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
+             box_2 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+         with gr.Row():
+             box_1 = gr.Textbox(label="Prompt A + LLM 1", interactive=False, lines=14)
+             box_3 = gr.Textbox(label="Prompt A + LLM 2", interactive=False, lines=14)
+
+         # -- 2 side-by-side textboxes for diagnosis results (Track1 & Track3)
+         with gr.Row():
+             diagnosis_box_1 = gr.Textbox(label="Diagnoses: Track1 (3 exercises)", interactive=False,
+                                          visible=True, lines=3)
+             diagnosis_box_3 = gr.Textbox(label="Diagnoses: Track3 (3 exercises)", interactive=False,
+                                          visible=True, lines=3)
+
+         # -- 2 side-by-side textboxes for final fixed flusters (Track1 & Track3)
+         with gr.Row():
+             fixes_box_1 = gr.Textbox(label="Final Fixed Track1", interactive=False, visible=True, lines=14)
+             fixes_box_3 = gr.Textbox(label="Final Fixed Track3", interactive=False, visible=True, lines=14)
 
-     # Return references if needed
-     return (model_choice_1,
-             model_choice_2,
-             exercises_input,
-             write_fluster_button,
-             [box_0, box_1, box_2, box_3],
+     # Return all necessary references
+     return (
+         model_choice_1,
+         model_choice_2,
+         include_diagnosis,
+         exercises_input,
+         write_fluster_button,
+         [box_0, box_1, box_2, box_3],
+         diagnosis_box_1,
+         diagnosis_box_3,
+         fixes_box_1,
+         fixes_box_3
      )
chains/diagnoser/diagnoser_chain.py CHANGED
@@ -12,13 +12,12 @@ class DiagnoserChain(BaseModel):
      llm_4o_mini: Any
      llm_4o: Any
 
-     async def diagnose_only(self, standardized_exercise: str) -> str:
+     async def diagnose_only(self, standardized_exercise: str) -> tuple[str, str]:
          """
          Takes a PRE-standardized exercise and:
-         (1) Runs multiple diagnosis prompts in parallel,
-         (2) Merges the results,
-         (3) Generates a scorecard line,
-         (4) Returns the combined text + scorecard.
+         Runs multiple diagnosis prompts, merges results, calls the scorecard prompt.
+         Returns a tuple: (combined_diagnosis, scorecard).
+         The first item is the merged text from each prompt; the second item is the final single-line scorecard.
          """
 
          # Step 1: define an async helper to run each diagnosis in parallel
@@ -47,7 +46,7 @@ class DiagnoserChain(BaseModel):
          scorecard_response = await self.llm_4o.ainvoke(scorecard_messages)
          scorecard = getattr(scorecard_response, "content", scorecard_response)
 
-         return combined_diagnosis + "\n--- [SCORECARD] ---\n" + scorecard
+         return combined_diagnosis, scorecard
 
      class Config:
          arbitrary_types_allowed = True
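
With `diagnose_only` now returning a pair instead of one concatenated string, callers unpack it and apply the `❌` marker check themselves (as `diagnose_and_fix_all` in the new `run_fluster_with_diagnosis.py` does). A minimal sketch, assuming an instantiated `DiagnoserChain` named `diag_chain`:

```python
combined_diagnosis, scorecard = await diag_chain.diagnose_only(standardized_exercise)
needs_fix = "❌" in scorecard  # the marker check used downstream to trigger fix_exercise
```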
chains/diagnoser/runner.py CHANGED
@@ -56,9 +56,13 @@ async def run_diagnoser(user_query: str, model_choice_diagnose: str, exercise_fo
      ]
      # run concurrently
      responses = await asyncio.gather(*tasks)
+     formatted_responses = [
+         f"{combined_diagnosis}\n--- [SCORECARD] ---\n{scorecard}"
+         for combined_diagnosis, scorecard in responses
+     ]
 
      # pad up to 10 if needed
-     all_responses = list(responses) + [""] * (10 - len(responses))
+     all_responses = formatted_responses + [""] * (10 - len(formatted_responses))
 
      # Return a tuple of exactly 10 responses, + the standardized exercise.
      return tuple(all_responses) + (standardized_exercise,)
chains/distractors/runner.py CHANGED
@@ -1,4 +1,4 @@
- # chains/distractors/runner.py
+ # chains/distractors/runner_without.py
  import asyncio
 
  from config.chain_configs import chain_configs
chains/exercises/run_fluster_with_diagnosis.py ADDED
@@ -0,0 +1,282 @@
+ # chains/exercises/run_fluster_with_diagnosis.py
+ import asyncio
+ from typing import List, Tuple
+
+ from pydantic import ValidationError
+
+ from app.helpers.exercise_standardizer import structurize_exercise, ExerciseSet, Exercise, exercise_to_string
+ from chains.exercises.runner_without import write_fluster_track
+ from config.chain_configs import chain_configs
+ from config.llm_config import llms
+
+
+ async def _async_fluster_with_diagnosis(
+     user_input_text: str,
+     model_choice_1: str,
+     model_choice_2: str
+ ) -> Tuple[str, str, str, str, str, str, str, str]:
+     """
+     The core async pipeline:
+       1. Generate fluster text for track0 & track2 (in parallel).
+       2. Parse each text => get a list of Exercise objects.
+       3. Diagnose each exercise => fix if needed.
+       4. Build the final output strings for the UI.
+     """
+     fluster_config = chain_configs["fluster"]
+     diagnoser_config = chain_configs["diagnoser"]
+
+     # 1) Generate track0 & track2 in parallel
+     track0_coro = write_fluster_track(
+         0,
+         user_input_text,
+         fluster_config["template_write_fluster_a"],
+         fluster_config["template_write_fluster_b"],
+         llms.get(model_choice_1, fluster_config["default_llm_a"]),
+         llms.get(model_choice_2, fluster_config["default_llm_b"]),
+         fluster_config["template_sanitize"],
+         fluster_config["llm_sanitize"]
+     )
+     track2_coro = write_fluster_track(
+         2,
+         user_input_text,
+         fluster_config["template_write_fluster_a"],
+         fluster_config["template_write_fluster_b"],
+         llms.get(model_choice_1, fluster_config["default_llm_a"]),
+         llms.get(model_choice_2, fluster_config["default_llm_b"]),
+         fluster_config["template_sanitize"],
+         fluster_config["llm_sanitize"]
+     )
+
+     (t0_idx, track0_text), (t2_idx, track2_text) = await asyncio.gather(track0_coro, track2_coro)
+
+     # 2) Parse each final text => list of Exercises (via the structurize prompt + LLM)
+     fluster0_set = await structurize_exercise(
+         track0_text, fluster_config["template_structurize"], fluster_config["llm_structurize"]
+     )
+     fluster2_set = await structurize_exercise(
+         track2_text, fluster_config["template_structurize"], fluster_config["llm_structurize"]
+     )
+     fluster0_exs = fluster0_set.exercises
+     fluster2_exs = fluster2_set.exercises
+
+     # 3) Diagnose + fix each exercise
+     diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config, fluster_config)
+     diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config, fluster_config)
+
+     # 4) Convert the final exercises to strings for display
+     #    (Or you can store them back into a bigger data structure.)
+     final0_text = build_fluster_text(fixed0_exs)
+     final2_text = build_fluster_text(fixed2_exs)
+
+     # We'll combine the diagnoses into single strings
+     diagnosis_text_0 = "\n".join(diag0_results)
+     diagnosis_text_2 = "\n".join(diag2_results)
+
+     # 5) Return the 8 items in the order your UI needs
+     #    (track0_text, "", track2_text, "", diag0_text, diag2_text, fixed0_text, fixed2_text)
+     return (
+         track0_text,        # box_0
+         "",                 # box_1 (unused)
+         track2_text,        # box_2
+         "",                 # box_3 (unused)
+         diagnosis_text_0,   # diagnosis_box_1
+         diagnosis_text_2,   # diagnosis_box_3
+         final0_text,        # fixes_box_1
+         final2_text         # fixes_box_3
+     )
+
+
+ def run_fluster_with_diagnosis(
+     user_input_text: str,
+     model_choice_1: str,
+     model_choice_2: str
+ ) -> Tuple[str, str, str, str, str, str, str, str]:
+     """
+     Synchronous entrypoint for the UI or external calls.
+     """
+     return asyncio.run(_async_fluster_with_diagnosis(user_input_text, model_choice_1, model_choice_2))
+
+
+ async def diagnose_and_fix_all(
+     exercises: List[Exercise],
+     diagnoser_config: dict,
+     fluster_config: dict
+ ) -> tuple[List[str], List[Exercise]]:
+     """
+     For each exercise, run the 'diagnose_only' from the DiagnoserChain,
+     then interpret the results (scorecard) to see if we need a fix,
+     then produce an updated exercise if needed.
+
+     Returns:
+       - a list of strings (one per exercise) summarizing the diagnosis,
+       - a list of possibly fixed exercises.
+     """
+     diag_chain = diagnoser_config["class"](
+         templates_diagnose=diagnoser_config["templates_diagnose"],
+         template_diagnose_scorecard=diagnoser_config["template_diagnose_scorecard"],
+         llm_diagnose=diagnoser_config["llm_diagnose"],
+         llm_4o_mini=diagnoser_config["llm_4o_mini"],
+         llm_4o=diagnoser_config["llm_4o"]
+     )
+
+     diag_strings = []
+     fixed_exs = []
+
+     # Could do parallel calls, but let's keep it simple here
+     for ex in exercises:
+         # 1) Build a standardized string from the exercise
+         ex_str = exercise_to_string(ex)
+         # 2) call diagnose_only => returns (combined text, scorecard)
+         combined_diag, scorecard = await diag_chain.diagnose_only(ex_str)
+         # 3) interpret the result
+         diag_result = (
+             f"Exercise {ex.id}:\n{combined_diag}\n--- [SCORECARD] ---\n{scorecard}"
+         )
+         diag_strings.append(diag_result)
+
+         if "❌" in scorecard:
+             ex_fixed = await fix_exercise(ex, scorecard, fluster_config)
+             fixed_exs.append(ex_fixed)
+         else:
+             fixed_exs.append(ex)
+
+     return diag_strings, fixed_exs
+
+
+ async def diagnose_exercise(ex: Exercise) -> tuple[str, str]:
+     """
+     Convert an Exercise object to a standardized string that DiagnoserChain can handle,
+     then call DiagnoserChain.diagnose_only(...).
+     """
+     # 1) standardize or build a string from the exercise
+     #    e.g. "Vraag: ...\nA) ...\nB) ...\nCorrect=1"
+     standardized_str = exercise_to_string(ex)
+
+     # 2) get the chain config for "diagnoser"
+     diag_config = chain_configs["diagnoser"]
+
+     # 3) instantiate the chain object (if needed) or reuse a global one
+     chain_instance = diag_config["class"](
+         templates_diagnose=diag_config["templates_diagnose"],
+         template_diagnose_scorecard=diag_config["template_diagnose_scorecard"],
+         llm_diagnose=diag_config["llm_diagnose"],
+         llm_4o_mini=diag_config["llm_4o_mini"],
+         llm_4o=diag_config["llm_4o"]
+     )
+
+     # 4) call diagnose_only
+     diagnosis = await chain_instance.diagnose_only(standardized_str)
+     return diagnosis
+
+
+ async def fix_exercise(
+     ex: Exercise,
+     diag_str: str,
+     fluster_config: dict
+ ) -> Exercise:
+     """
+     Calls 'template_fix_exercise' + 'llm_fix_exercise' from the fluster config
+     to rewrite the exercise so it addresses the diagnosis issues.
+     """
+     template_fix = fluster_config["template_fix_exercise"]
+     llm_fix = fluster_config["llm_fix_exercise"]
+
+     # 1) Convert the exercise to text
+     ex_text = exercise_to_string(ex)
+
+     # 2) Format the fix prompt
+     prompt_value = await template_fix.aformat_prompt(
+         exercise_text=ex_text,
+         diagnosis=diag_str
+     )
+     messages = prompt_value.to_messages()
+
+     # 3) Invoke the LLM
+     fix_resp = await llm_fix.ainvoke(messages)
+     raw_content = getattr(fix_resp, "content", fix_resp)
+
+     # 4) We can parse the LLM result if we want a structured object.
+     #    For example, if we told the LLM to return JSON that matches the Exercise schema:
+     #        ex_dict = json.loads(raw_content)
+     #        ex_fixed = Exercise.model_validate(ex_dict)
+     #    wrapped in a try/except (JSONDecodeError, ValidationError) with a fallback such as:
+     #        ex_fixed = ex.copy(update={"prompt": ex.prompt + " (fallback fix)"})
+
+     # For the sake of example, let's do a naive approach:
+     ex_fixed = ex.copy(update={"prompt": raw_content})
+
+     return ex_fixed
+
+
+ def build_fluster_text(ex_list: list[Exercise]) -> str:
+     """
+     Combine the final fixed exercises into a user-facing block of text.
+     """
+     lines = []
+     for ex in ex_list:
+         lines.append(
+             f"Exercise {ex.id}:\n"
+             f" {ex.prompt}\n"
+             f" 1) {ex.choice_id_1}\n"
+             f" 2) {ex.choice_id_2}\n"
+             f" 3) {ex.choice_id_3}\n"
+             f" 4) {ex.choice_id_4}\n"
+             f" Correct answer: {ex.correct_answer_id}\n"
+             f" Explanation: {ex.explanation}\n\n"
+         )
+     return "\n".join(lines)
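
A sketch (illustrative, not committed code) of the synchronous entrypoint's contract, i.e. the 8-tuple it hands to the Gradio outputs. The model-choice keys are taken from `config/llm_config.py`; note that `asyncio.run` assumes no event loop is already running in the calling thread.

```python
(
    track0_text, _unused_1, track2_text, _unused_3,
    diagnosis_text_0, diagnosis_text_2,
    final0_text, final2_text,
) = run_fluster_with_diagnosis(
    "De student weet dat ...",   # learning objective
    "GPT-4o (zero temp)",        # model_choice_1 -> LLM A
    "GPT-4o-mini (zero temp)",   # model_choice_2 -> LLM B
)
```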
chains/exercises/runner_with.py ADDED
File without changes
chains/exercises/{runner.py → runner_without.py} RENAMED
@@ -1,4 +1,4 @@
- # chains/exercises/runner.py
+ # chains/exercises/runner_without.py
  import asyncio
  from typing import AsyncGenerator
  from config.llm_config import llms
@@ -6,7 +6,66 @@ from config.chain_configs import chain_configs
  from config.templates import template_sanitize_fluster
 
 
- async def run_fluster(
+ # chains/exercises/runner_utils.py (for example)
+
+ import asyncio
+ from typing import Tuple, Any
+ from langchain_core.prompts.chat import ChatPromptTemplate
+
+ async def write_fluster_track(
+     track_index: int,
+     user_input_text: str,
+     template_write_a: ChatPromptTemplate,
+     template_write_b: ChatPromptTemplate,
+     llm_a: Any,
+     llm_b: Any,
+     # If you later enable the "refine" step, pass those too:
+     # template_refine: ChatPromptTemplate,
+     # llm_refine: Any,
+     template_sanitize: ChatPromptTemplate,
+     llm_sanitize: Any
+ ) -> Tuple[int, str]:
+     """
+     A reusable helper that:
+       (1) Picks prompt A or B,
+       (2) Picks LLM A or B,
+       (3) Generates a fluster,
+       (4) Optionally refines distractors,
+       (5) Sanitizes,
+       (6) Returns (track_index, final_text).
+     """
+
+     # Decide which prompt to use
+     if track_index in (0, 2):
+         gen_template = template_write_a
+     else:
+         gen_template = template_write_b
+
+     # Decide which LLM to use
+     if track_index in (0, 1):
+         gen_llm = llm_a
+     else:
+         gen_llm = llm_b
+
+     # 1) Generate
+     gen_msg = await gen_template.aformat_prompt(learning_objective=user_input_text)
+     gen_resp = await gen_llm.ainvoke(gen_msg.to_messages())
+     write_fluster_result = getattr(gen_resp, "content", gen_resp)
+
+     # 2) Refine distractors (currently skipped)
+     # refine_msg = await template_refine.aformat_prompt(write_fluster_result=write_fluster_result)
+     # refine_resp = await llm_refine.ainvoke(refine_msg.to_messages())
+     # refined_output = getattr(refine_resp, "content", refine_resp)
+
+     # 3) Sanitize
+     sanitize_msg = await template_sanitize.aformat_prompt(refinement_result=write_fluster_result)
+     sanitize_resp = await llm_sanitize.ainvoke(sanitize_msg.to_messages())
+     sanitized_output = getattr(sanitize_resp, "content", sanitize_resp)
+
+     return (track_index, sanitized_output)
+
+
+ async def run_fluster_no_diagnosis(
      user_input_text: str,
      model_choice_1: str,  # for "LLM A"
      model_choice_2: str   # for "LLM B"
@@ -39,53 +98,23 @@ async def run_fluster(
      # We'll hold the final results for each of the 4 tracks in a list
      partial_results = ["", "", "", ""]
 
-     # Helper function: runs the pipeline for a single track
-     async def run_track(track_index: int):
-         """
-         Steps for each track:
-           1) pick prompt A or B
-           2) pick LLM A or B
-           3) generate
-           4) refine
-           5) sanitize
-           6) return final text
-         """
-         # Decide which prompt to use
-         if track_index in (0, 2):
-             gen_template = template_write_a
-         else:
-             gen_template = template_write_b
-
-         # Decide which LLM to use
-         if track_index in (0, 1):
-             gen_llm = llm_a
-         else:
-             gen_llm = llm_b
-
-         # 1) Generate
-         gen_msg = await gen_template.aformat_prompt(learning_objective=user_input_text)
-         gen_resp = await gen_llm.ainvoke(gen_msg.to_messages())
-         write_fluster_result = getattr(gen_resp, "content", gen_resp)
-
-         # 2) Refine distractors << # we skip refinement for now
-         # refine_msg = await template_refine.aformat_prompt(write_fluster_result=write_fluster_result)
-         # refine_resp = await llm_refine.ainvoke(refine_msg.to_messages())
-         # refined_output = getattr(refine_resp, "content", refine_resp)
-
-         # 3) Sanitize
-         sanitize_msg = await template_sanitize.aformat_prompt(refinement_result=write_fluster_result)
-         sanitize_resp = await llm_sanitize.ainvoke(sanitize_msg.to_messages())
-         sanitized_output = getattr(sanitize_resp, "content", sanitize_resp)
-
-         return track_index, sanitized_output
-
-     # Prepare the 4 tasks
-     tasks = [
-         run_track(0),
-         run_track(1),
-         run_track(2),
-         run_track(3),
-     ]
+     ## We'll define tasks that each call `write_fluster_track(...)`
+     tasks = []
+     for track_i in range(4):
+         coro = write_fluster_track(
+             track_i,
+             user_input_text,
+             template_write_a,
+             template_write_b,
+             llm_a,
+             llm_b,
+             # template_refine,
+             # llm_refine,
+             template_sanitize,
+             llm_sanitize
+         )
+         tasks.append(coro)
 
      # Run them in parallel
      for coro in asyncio.as_completed(tasks):
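
The hunk ends at the `as_completed` loop, so its body isn't shown; given the `(track_index, text)` return type of `write_fluster_track` and the `AsyncGenerator` import, the consumption pattern is presumably along these lines (a sketch, not the committed code):

```python
for coro in asyncio.as_completed(tasks):
    track_index, final_text = await coro
    partial_results[track_index] = final_text
    # stream an intermediate 4-tuple so the UI textboxes update as tracks finish
    yield tuple(partial_results)
```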
config/chain_configs.py CHANGED
@@ -16,7 +16,7 @@ from config.templates import (
      template_write_fluster_a,
      template_write_fluster_b,
      template_refine_fluster,
-     template_sanitize_fluster,
+     template_sanitize_fluster, template_isolate_exercises, template_fix_exercise,
  )
  from chains.diagnoser.diagnoser_chain import DiagnoserChain
  from chains.distractors.distractors_chain import DistractorsChain
@@ -76,5 +76,9 @@ chain_configs = {
          "llm_refine": llms["GPT-4o (zero temp)"],
          "template_sanitize": template_sanitize_fluster,
          "llm_sanitize": llms["GPT-4o-mini (zero temp)"],
+         "template_structurize": template_isolate_exercises,
+         "llm_structurize": llms["GPT-4o (zero temp)"],
+         "template_fix_exercise": template_fix_exercise,
+         "llm_fix_exercise": llms["GPT-4o (low temp)"],
      },
  }
config/llm_config.py CHANGED
@@ -44,8 +44,8 @@ llms = {
      "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
      # OpenAI reasoning models (no temperature)
-     "o1 (low reasoning_effort)": create_openai_reasoning_llm("o1-2024-12-17", reasoning_effort="low"),
-     "o1 (high reasoning_effort)": create_openai_reasoning_llm("o1-2024-12-17", reasoning_effort="high"),
+     "o1 (low reasoning_effort)": create_openai_reasoning_llm("o1", reasoning_effort="low"),
+     "o1 (high reasoning_effort)": create_openai_reasoning_llm("o1", reasoning_effort="high"),
      "o3-mini (low reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
      "o3-mini (medium reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
      "o3-mini (high reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
config/system_prompt_texts.py CHANGED
@@ -634,10 +634,11 @@ An explanation should sometimes be presented to the student after they've answer
  # Requirements
 
  ## Exercise
- Each of the 3 exercises must test the very same key fact in the given learning objective (the info that's not in parentheses). Assume this described fact is self-evident, not in need of any further outside source or authority for substantiation. Any text between parentheses must only be used in the Theory or Explanation sections of the exercises.
+ Each of the 3 exercises must test the very same key fact in the given learning objective (the info that's not in parentheses). Any text between parentheses must only be used in the Theory or Explanation sections of the exercises.
 
  ## Prompt
- The information in the prompt should only contain information that's also present in the learning objective. For example, don't reference anything outside of it (for example, don't use "according to the study text" if the learning objective doesn't say this either).
+ The information that's posed in the prompt part of the exercise should only contain information that's also present in the learning objective. Do not reference any source outside of it. See the below examples, both for the learning objective:
+
 
  ## Theory & Explanation (optional)
  Theory or Explanation should only be added to all 3 exercises if there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
@@ -650,7 +651,7 @@ Put any info there that is not necessary to clarify the prompt beforehand (or th
  A good distractor makes a student pause and consider it, separating those who understand the material from those who do not. A bad distractor fails to do this; it can either:
  1. Confuse or trick even well-prepared students into believing it might be correct (“too close to the truth”)
  2. Be so obviously wrong that no one would reasonably choose it, not even the least knowledgeable student (“too obviously false”).
- To be effective, distractors must therefore look "very plausible to someone who doesn't know the topic" and yet remai n "clearly wrong to someone who knows the topic well", all at the same time.
+ To be effective, distractors must therefore look "very plausible to someone who doesn't know the topic" and yet remain "clearly wrong to someone who knows the topic well", all at the same time.
  Distractors are too close to the truth, when they are so similar to the correct answer that experts might debate whether they're also valid. They create unnecessary ambiguity and frustrate knowledgeable test-takers, for example by containing partial truths.
  Distractors are too obviously false, when they are clearly ridiculous or fantastical to even the dumbest student.
  The ideal distractor falls in the middle of this spectrum - plausible enough to tempt those with incomplete knowledge, but clearly incorrect to those who understand the material.
@@ -1034,6 +1035,26 @@ Roughly follow the following template:
  [exercise 3]
  """
 
+ template_isolate_exercises_text = """
+ Split up the given exercise set into its individual exercises, adhering to this schema:
+ class Exercise(BaseModel):
+     id: int
+     prompt: str
+     choice_id_1: str
+     choice_id_2: str
+     choice_id_3: Union[str, None]
+     choice_id_4: Union[str, None]
+     correct_answer_id: Literal[1, 2, 3, 4]
+     explanation: Union[str, None]
+
+
+ class ExerciseSet(BaseModel):
+     id: int
+     exercises: List[Exercise]
+
+ Set sequential ids starting at 1. Prompt is the posing of the question (including headers, like 'Vraag:' and Theory/Case if present). Third and fourth answer options (choices) are optional, as are the explanations, as they're not always present.
+ """
+
 
 
  XML_templates= [
config/templates.py CHANGED
@@ -18,7 +18,7 @@ from config.system_prompt_texts import (
      template_write_fluster_a_text,
      template_write_fluster_b_text,
      template_refine_fluster_text,
-     template_sanitize_fluster_text,
+     template_sanitize_fluster_text, template_isolate_exercises_text,
  )
 
 
@@ -242,8 +242,6 @@ template_refine_fluster = ChatPromptTemplate(
      input_variables=["write_fluster_result"]
  )
 
-
-
  template_sanitize_fluster = ChatPromptTemplate(
      messages=[
          ("system", template_sanitize_fluster_text),
@@ -252,4 +250,27 @@ template_sanitize_fluster = ChatPromptTemplate(
      input_variables=["refinement_result"]
  )
 
+ template_isolate_exercises = ChatPromptTemplate(
+     messages=[
+         ("system", template_isolate_exercises_text),
+         ("human", "{fluster}")
+     ],
+     input_variables=["fluster"]
+ )
 
+ template_fix_exercise = ChatPromptTemplate(
+     messages=[
+         (
+             "system",
+             "You are a helpful assistant that fixes issues in a single multiple choice exercise "
+             "based on diagnosis notes. Return only valid text with the same keys as the original."
+         ),
+         (
+             "user",
+             "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
+             "Rewrite the exercise so that all issues in the diagnosis are resolved. "
+             "Use the same structure (prompt, choice_id_1..4, correct_answer_id, explanation)."
+         ),
+     ],
+     input_variables=["exercise_text", "diagnosis"]
+ )
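
A quick local sanity check that the two new templates render with their declared input variables, with no LLM call involved (a sketch with made-up filler values):

```python
msgs = template_isolate_exercises.format_messages(
    fluster="Vraag: ...\n1) a\n2) b\nCorrect: 1"
)
msgs_fix = template_fix_exercise.format_messages(
    exercise_text="Exercise 1: ...",
    diagnosis="--- [SCORECARD] ---\n❌ distractor too close to the truth",
)
print(msgs[0].content[:80])
print(msgs_fix[-1].content[:80])
```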
main.py CHANGED
@@ -11,7 +11,8 @@ from app.ui.test_set_tab import build_test_set_tab
  from app.ui.write_fluster_tab import build_write_fluster_tab
  from chains.diagnoser.runner import run_diagnoser
  from chains.distractors.runner import run_distractors
- from chains.exercises.runner import run_fluster
+ from chains.exercises.run_fluster_with_diagnosis import run_fluster_with_diagnosis
+ from chains.exercises.runner_without import run_fluster_no_diagnosis
  from chains.learning_objectives_generator.runner import run_learning_objectives_generator
  from utils.auth import login as auth_login
 
@@ -95,8 +96,13 @@ with gr.Blocks() as interface:
      (model_choice_fluster_1,
       model_choice_fluster_2,
       exercises_input,
+      include_diagnosis,
       write_fluster_button,
       [fluster_box_0, fluster_box_1, fluster_box_2, fluster_box_3],
+      diagnosis_box_1,
+      diagnosis_box_3,
+      fixes_box_1,
+      fixes_box_3
      ) = build_write_fluster_tab()
 
      # 6 Empty separators (somehow scale=6 doesn't work)
@@ -163,12 +169,50 @@ with gr.Blocks() as interface:
          # or "stream=True" depending on your version of Gradio
      )
 
+     def fluster_pipeline_dispatch(
+         user_input: str,
+         model_1: str,
+         model_2: str,
+         include_diagnosis: bool
+     ):
+         """
+         Decide how to run the fluster generation.
+         If include_diagnosis=False, we do all 4 tracks, no diagnosing/fixing.
+         If include_diagnosis=True, we ONLY do tracks 1 & 3, then parse+diagnose+fix them.
+         We'll then return 8 values:
+           (track1, track2, track3, track4, diag1, diag3, fix1, fix3)
+         """
+         if not include_diagnosis:
+             # => run the original pipeline that yields 4 parallel flusters
+             #    and do NOT parse/diagnose/fix anything.
+             track0, track1, track2, track3 = run_fluster_no_diagnosis(user_input, model_1, model_2)
+             return (track0, track1, track2, track3, "", "", "", "")
+         else:
+             # => run only track0 & track2 (i.e. track 1 & track3 in the UI),
+             #    parse them for 3 exercises each, diagnose, fix
+             return run_fluster_with_diagnosis(user_input, model_1, model_2)
+
+
      write_fluster_button.click(
-         fn=run_fluster,  # async generator
-         inputs=[exercises_input, model_choice_fluster_1, model_choice_fluster_2],
-         outputs=[fluster_box_0, fluster_box_1, fluster_box_2, fluster_box_3],  # fill the 4 textboxes
-         api_name=None,
-         queue=True,
+         fn=fluster_pipeline_dispatch,
+         inputs=[
+             exercises_input,
+             model_choice_fluster_1,
+             model_choice_fluster_2,
+             include_diagnosis
+         ],
+         outputs=[
+             fluster_box_0,    # track1
+             fluster_box_1,    # track2
+             fluster_box_2,    # track3
+             fluster_box_3,    # track4
+             diagnosis_box_1,
+             diagnosis_box_3,
+             fixes_box_1,
+             fixes_box_3
+         ],
+         queue=True
      )
 
      pipeline_choice.change(fn=log_dropdown_choice, inputs=pipeline_choice, outputs=[])
pending_issues.md CHANGED
@@ -8,12 +8,30 @@ De student weet dat iemands leven wordt gevormd door drie dingen: interne factor
 
  # Fluster generation
  ## References to things outside of the LO
- Input:
- De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
+ ### Input: De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
 
- Result (1/2):
+ Result 75%:
 
  Vraag:
  Waaruit wordt iemands leven volgens de leerstof gevormd?
 
+ Vraag:
+ Volgens de leerdoel wordt ieders leven gevormd door drie dingen. Welke drie zijn dat?
+
+ ## Overuse of absolutes Alleen/Uitsluitend
+ ### Input: De student weet dat iemands leven wordt gevormd door drie dingen: interne factoren, externe factoren en zelfbepaling.
+
+ Result (90%):
+ Ieders leven wordt uitsluitend bepaald door interne factoren.
+
+ Ieders leven wordt alleen gevormd door externe factoren en zelfbepaling.
+
+ ### Input: De student weet dat interne factoren van binnenuit komen (zoals genen en gezondheid) en mede bepalen wie je bent.:
+ Vraag: Wat wordt er bedoeld met "interne factoren" als we kijken naar wie je bent?
+ 1. Dat zijn factoren van binnenuit, zoals je genen en gezondheid, die mede bepalen wie je bent. ⬅️
+ 2. Dat zijn omstandigheden in je omgeving, zoals gezin en cultuur, die bepalen wie je bent.
+ 3. Dat zijn **alleen eigenschappen** die je tijdens je jeugd aanleert op school en in je omgeving.
+ 4. Dat zijn **uitsluitend dingen** die te maken hebben met je gedrag, zonder invloed van bijvoorbeeld je lichamelijke gesteldheid.
37