BtB-ExpC committed
Commit eec7865 · 2 Parent(s): 7282fa9 9d2a494

added new models (Gemini 2.5 pro & gpt-4.1-mini)

app/ui/write_fluster_tab.py CHANGED
@@ -28,7 +28,7 @@ def build_write_fluster_tab():
     )

     include_diagnosis = gr.Checkbox(
-        label="Immediately diagnose & fix",
+        label="Immediately diagnose & fix 🚧",
        value=False,
        info="Diagnose each exercise and fix if issues found?"
     )

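Note: the 🚧 presumably flags the diagnose-and-fix path as work in progress. For context, a minimal sketch of how a checkbox like this is typically wired into a tab's run handler in Gradio; the surrounding component names (user_text, run_btn, output_box) and the handler are hypothetical, not taken from this repo:

import gradio as gr

with gr.Blocks() as demo:
    user_text = gr.Textbox(label="Learning objective")
    include_diagnosis = gr.Checkbox(
        label="Immediately diagnose & fix 🚧",
        value=False,
        info="Diagnose each exercise and fix if issues found?"
    )
    run_btn = gr.Button("Write fluster")
    output_box = gr.Textbox(label="Result")

    def on_run(text: str, diagnose: bool) -> str:
        # The checkbox value arrives as a plain bool; route to the
        # diagnosis pipeline only when it is ticked.
        return f"diagnose={diagnose}: {text}"

    run_btn.click(on_run, inputs=[user_text, include_diagnosis], outputs=output_box)
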
chains/exercises/run_fluster_with_diagnosis.py CHANGED
@@ -1,6 +1,6 @@
 # chains/exercises/run_fluster_with_diagnosis.py
 import asyncio
-from typing import Tuple, List
+from typing import Tuple, List, Any

 from app.helpers.exercise_standardizer import structurize_exercise, ExerciseSet, Exercise, exercise_to_string
 from chains.exercises.runner_without import write_fluster_track
@@ -64,6 +64,9 @@ async def _async_fluster_with_diagnosis(
     fluster_config = chain_configs["fluster"]
     diagnoser_config = chain_configs["diagnoser"]

+    llm1 = llms.get(model_choice_1, fluster_config["default_llm_a"])
+    llm2 = llms.get(model_choice_2, fluster_config["default_llm_b"])
+
     # 1) Generate track0 & track2 in parallel
     track0_coro = write_fluster_track(
         user_input_text,
@@ -85,8 +88,8 @@ async def _async_fluster_with_diagnosis(
     fluster2_exs = await parse_fluster_text_to_exercises(track2_text)

     # 3) Diagnose + fix each exercise
-    diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config)
-    diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config)
+    diag0_results, fixed0_exs = await diagnose_and_fix_all(fluster0_exs, diagnoser_config, llm_fix=llm1)
+    diag2_results, fixed2_exs = await diagnose_and_fix_all(fluster2_exs, diagnoser_config, llm_fix=llm2)

     # 4) Convert the final exercises to strings for display
     # (Or you can store them back into a bigger data structure.)
@@ -110,16 +113,6 @@ async def _async_fluster_with_diagnosis(
         final2_text  # fixes_box_3
     )

-def run_fluster_with_diagnosis(
-    user_input_text: str,
-    model_choice_1: str,
-    model_choice_2: str
-) -> Tuple[str, str, str, str, str, str, str, str]:
-    """
-    Synchronous entrypoint for the UI or external calls.
-    """
-    return asyncio.run(_async_fluster_with_diagnosis(user_input_text, model_choice_1, model_choice_2))
-

 async def write_fluster_track(
     user_input: str,
@@ -140,8 +133,7 @@ async def write_fluster_track(
     # 2) Decide LLM
     # either use model_choice_key from user, or the config's default
     fallback_llm = fluster_config["default_llm_a"] if track_index in (0, 1) else fluster_config["default_llm_b"]
-    gen_llm = chain_configs["fluster"].get(model_choice_key, fallback_llm)
-    # ^ careful: you'd need a dictionary of LLMs. Or do: llm = llms.get(model_choice_key, fallback_llm)
+    gen_llm = llms.get(model_choice_key, fallback_llm)

     # 3) Format + invoke the "writing" prompt
     prompt_value = await gen_template.aformat_prompt(learning_objective=user_input)
@@ -166,9 +158,9 @@ async def write_fluster_track(


 async def diagnose_and_fix_all(
-    exercises: List[Exercise],
-    diagnoser_config: dict
-) -> tuple[List[str], List[Exercise]]:
+    exercises: List[Exercise],
+    diagnoser_config: dict,
+    llm_fix: Any) -> tuple[List[str], List[Exercise]]:
     """
     For each exercise, run the 'diagnose_only' from the DiagnoserChain,
     then interpret the results (scorecard) to see if we need a fix,
@@ -177,6 +169,7 @@ async def diagnose_and_fix_all(
     Returns:
     - a list of strings (one per exercise) summarizing the diagnosis,
     - a list of possibly fixed exercises.
+    :param llm_fix: optional LLM override for the rewrite step; falls back to the config default.
     """
     diag_chain = diagnoser_config["class"](
         templates_diagnose=diagnoser_config["templates_diagnose"],
@@ -206,7 +199,7 @@ async def diagnose_and_fix_all(
     fluster_config = chain_configs["fluster"]

     if "❌" in scorecard:
-        ex_fixed = await fix_exercise(ex, scorecard, fluster_config)
+        ex_fixed = await fix_exercise(ex, scorecard, fluster_config, llm_fix)
         fixed_exs.append(ex_fixed)
     else:
         fixed_exs.append(ex)
@@ -247,55 +240,27 @@ async def diagnose_exercise(ex: Exercise) -> str:

 from pydantic import ValidationError

-async def fix_exercise(
-    ex: Exercise,
-    diag_str: str,
-    fluster_config: dict
-) -> Exercise:
-    """
-    Calls 'template_fix_exercise' + 'llm_fix_exercise' from the fluster config
-    to rewrite the exercise so it addresses the diagnosis issues.
-    """
-
-    template_fix = fluster_config["template_fix_exercise"]
-    llm_fix = fluster_config["llm_fix_exercise"]
-
-    # 1) Convert the exercise to text
-    ex_text = exercise_to_string(ex)  # some function that formats ex into text
-
-    # 2) Format the fix prompt
-    prompt_value = await template_fix.aformat_prompt(
-        exercise_text=ex_text,
-        diagnosis=diag_str
-    )
-    messages = prompt_value.to_messages()
-
-    # 3) Invoke the LLM
-    fix_resp = await llm_fix.ainvoke(messages)
-    raw_content = getattr(fix_resp, "content", fix_resp)
-
-    # 4) We can parse the LLM result if we want a structured object
-    # For example, if we told the LLM to return JSON that matches the Exercise schema:
-    # ex_fixed_data = parse the JSON
-    # ex_fixed = Exercise.model_validate(ex_fixed_data)
-    #
-    # Or if the LLM just returned plain text, you can do a simpler approach:
-    # For now, as a placeholder, let's just say we re-build the prompt field:
-
-    # If you do structured output, do something like:
-    # try:
-    #     ex_dict = json.loads(raw_content)
-    #     ex_fixed = Exercise.model_validate(ex_dict)
-    # except (JSONDecodeError, ValidationError) as e:
-    #     # fallback if needed
-    #     ex_fixed = ex.copy(update={"prompt": ex.prompt + " (fallback fix)"})
-
-    # For the sake of example, let's do a naive approach:
-    ex_fixed = ex.copy(update={"prompt": raw_content})
-
-    return ex_fixed
-
-
+async def fix_exercise(ex: Exercise, diag_str: str, cfg: dict, llm_fix: Any = None) -> Exercise:
+    tmpl_fix = cfg["template_fix_exercise"]
+    if not llm_fix:
+        llm_fix = cfg["llm_fix_exercise"]
+    llm_cast = cfg["llm_structurize"]  # already in chain_configs
+
+    # 1️⃣ first call: creative rewrite
+    prompt = await tmpl_fix.aformat_prompt(
+        exercise_text=exercise_to_string(ex),
+        diagnosis=diag_str
+    )
+    raw = (await llm_fix.ainvoke(prompt.to_messages())).content
+
+    # 2️⃣ second call: cast to schema
+    try:
+        ex_fixed = await llm_cast.with_structured_output(Exercise).ainvoke(
+            [("user", raw)]  # minimal prompt: just the text
+        )
+        return ex_fixed
+    except Exception:
+        return ex.copy(update={"prompt": raw})
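The rewritten fix_exercise replaces the single-call rewrite with a two-call pattern: a creative free-form rewrite, then a second call that casts the raw text onto the Exercise schema via structured output, with a naive prompt overwrite as fallback. A self-contained sketch of that pattern, assuming LangChain chat models and a stand-in Exercise schema (the model names here are illustrative, not the repo's config):

from pydantic import BaseModel
from langchain_openai import ChatOpenAI

class Exercise(BaseModel):
    prompt: str
    choices: list[str]
    correct_answer_id: int
    explanation: str | None = None

async def rewrite_then_cast(raw_exercise: str, diagnosis: str) -> Exercise | str:
    creative_llm = ChatOpenAI(model="gpt-4.1", temperature=0.7)   # free-form rewrite
    cast_llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.0)  # deterministic cast

    # Call 1: rewrite the exercise as plain text, guided by the diagnosis.
    rewrite = await creative_llm.ainvoke(
        f"Original exercise:\n{raw_exercise}\n\nDiagnosis:\n{diagnosis}\n\n"
        "Rewrite the exercise so that all issues are resolved."
    )

    # Call 2: cast the free-form text into the schema; fall back to raw text.
    try:
        return await cast_llm.with_structured_output(Exercise).ainvoke(
            [("user", rewrite.content)]
        )
    except Exception:
        return rewrite.content

Splitting the creative and casting steps lets a cheap deterministic model handle the schema work while the stronger model focuses on content.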
 
config/llm_config.py CHANGED
@@ -55,6 +55,8 @@ llms = {
     "GPT-4.1-mini (zero temp)": create_openai_llm("gpt-4.1-mini", ZERO),
     "GPT-4.1-mini (low temp)": create_openai_llm("gpt-4.1-mini", LOW),
     "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
+    "GPT-4.1 (low temp)": create_openai_llm("gpt-4.1", LOW),
+    "GPT-4.1 (mid temp)": create_openai_llm("gpt-4.1", MID),
     "GPT-4.5 (low temp)": create_openai_llm("gpt-4.5-preview-2025-02-27", LOW),
     "GPT-4.5 (mid temp)": create_openai_llm("gpt-4.5-preview-2025-02-27", MID),

@@ -64,6 +66,7 @@ llms = {
     "o3-mini (low reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
     "o3-mini (medium reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
     "o3-mini (high reasoning_effort)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
+    "o3 (high reasoning_effort)": create_openai_reasoning_llm("o3", reasoning_effort="high"),

     # Anthropic models (Claude)
     "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
@@ -75,13 +78,14 @@ llms = {
     "Claude 3.7": create_anthropic_reasoning_llm("claude-3-7-sonnet-latest"),

     # DeepSeek
-    "Deepseek R1 (zero temp)": create_deepseek_llm("deepseek-reasoner", ZERO),
-    "Deepseek R1 (low temp)": create_deepseek_llm("deepseek-reasoner", LOW),
-    "Deepseek R1 (mid temp)": create_deepseek_llm("deepseek-reasoner", MID),
-    "Deepseek R1 (high temp)": create_deepseek_llm("deepseek-reasoner", HIGH),
+    "DeepSeek-R1 (zero temp)": create_deepseek_llm("deepseek-reasoner", ZERO),
+    "DeepSeek-R1 (low temp)": create_deepseek_llm("deepseek-reasoner", LOW),
+    "DeepSeek-R1 (mid temp)": create_deepseek_llm("deepseek-reasoner", MID),
+    "DeepSeek-R1 (high temp)": create_deepseek_llm("deepseek-reasoner", HIGH),
+    "DeepSeek-V3 (low temp)": create_deepseek_llm("deepseek-chat", LOW),

     # Google models (Gemini)
-    "Gemini 2.5 Pro Experimental (zero temp)": create_google_reasoning_llm(model_name="gemini-2.5-pro-exp-03-25"),
+    "Gemini 2.5 Pro Experimental (zero temp)": create_google_reasoning_llm(model_name="gemini-2.5-pro"),
 }

 # specific for Diagnosis tab
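Because the UI dropdown labels double as the dictionary keys, a renamed entry (e.g. "Deepseek R1 ..." to "DeepSeek-R1 ...") would break direct indexing for any stale label; the llms.get(choice, fallback) pattern used in the runner degrades to the configured default instead. A minimal illustration with stand-in values (the strings below are placeholders for the real LLM objects):

llms = {
    "GPT-4.1 (low temp)": "openai:gpt-4.1",
    "DeepSeek-R1 (low temp)": "deepseek:deepseek-reasoner",
}
default_llm = llms["GPT-4.1 (low temp)"]

# A stale pre-rename label misses the dict; fall back gracefully.
chosen = llms.get("Deepseek R1 (low temp)", default_llm)
assert chosen == "openai:gpt-4.1"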
config/system_prompt_texts.py CHANGED
@@ -496,7 +496,7 @@ template_sanitize_learning_objectives_text = """
 template_write_fluster_a_text= """
 # Task outline

-Given a learning objective, your goal is to write an exercise set of 3 high-quality multiple choice exercises that all test the exact same key fact that's stated in the learning objective.
+Given a learning objective, your goal is to write an exercise set of 3 high-quality multiple choice exercises that all test the exact same key fact that's stated in the learning objective. All exercises must be written in the same language as the learning objective.


 # Concepts
@@ -507,8 +507,8 @@ A learning objective states a specific fact. For example: "De student weet dat d
 ## Exercise
 An exercise tests the fact that is stated in the learning objective. It consists of:
 1. A prompt, posing to the student:
+- (Optional) Theory, additional information to clarify the question or statement
 - A question or statement
-- (Optional) Theory, additional information to clarify the question or statement
 2. Choices, which are the multiple answer options that are presented to the student as potential answers to the prompt.
 3. Correct answer, which indicates which of the choices is the correct answer to the prompt.
 4. (Optional) Explanation, explaining or expanding on the answer to the student to facilitate increased learning.
@@ -518,13 +518,13 @@ The student is always first presented with 1 and 2 (prompt and choices), and the
 An exercise set comprises 3 exercises that all test the same single learning objective in three different ways: one bigger multiple choice exercise and two smaller true/false statements.

 ## Distractors
-Distractors are the alternative answer option choices of the exercises that are not the correct answer. The false statement can also be considered a distractor (tempting the student to thing it is correct). Distractors are in fact the most important part of the exercises, because they often either make or break it. This is because distractors are difficult to get right, because in order to be effective they need to strike a precarious balance between "plausible-sounding" and yet "not too close to the truth", both at the same time. More on that in the requirements section.
+Distractors are the alternative answer option choices of the exercises that are not the correct answer. The false statement can also be considered a distractor (tempting the student to think it is correct). Distractors are in fact the most important part of the exercises, because they often either make or break it. This is because distractors are difficult to get right, because in order to be effective they need to strike a precarious balance between "plausible-sounding" and yet "not too close to the truth", both at the same time. More on that in the requirements section.

 ## Theory (optional)
-Theory is sometimes shown before answering the exercise, as an optional part of the prompt to clarify the question.
+Theory is sometimes shown before answering the exercise, as an optional part of the prompt to clarify the question or statement.

 ## Explanation (optional)
-An explanation should sometimes be presented to the student after they've answered the exercise, as an optional part of the correct answer reveal to better facilitate learning.
+An explanation should sometimes be presented to the student after they've answered the exercise, as an optional part of the correct answer reveal to better facilitate learning. For the false-statement exercise, explanations are mandatory (because they should provide the correct answer to the student).


 # Examples
@@ -641,7 +641,7 @@ The information that's posed in the prompt part of the exercise should only cont


 ## Theory & Explanation (optional)
-Theory or Explanation should only be added to all 3 exercises if there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
+Theory or Explanation should only be added if they're relevant to all 3 exercises and there's additional info present in the learning objective (often between parentheses, or as a subclause) that is outside of the main fact that's to be tested.
 ### Theory (optional)
 Put any info here that is useful for the student to know before answering the question, as context to clarify the question or statement. The student is prompted with this together with the posing of the rest of the exercise.
 ### Explanation (optional)
@@ -660,7 +660,7 @@ The ideal distractor falls in the middle of this spectrum - plausible enough to
 Try to exactly match the terminology and language difficulty level from the learning objective. If it's stated in simple words, use equally simple words in the exercises as well.

 ## Output format
-Output format doesn't matter. Only prioritize thorough reasoning to arrive at high-quality exercises that satisfy all of the above requirements.
+Output format doesn't matter; parsing your response into structured exercises will be handled later in a separate step. For now, prioritize thorough reasoning to arrive at the highest-quality exercises that optimally satisfy all of the above requirements. Feel free to brainstorm and even back-track if you notice a mistake while generating a response.

 # Approach

config/templates.py CHANGED
@@ -256,11 +256,11 @@ template_fix_exercise = ChatPromptTemplate(
     (
         "system",
         "You are a helpful assistant that fixes issues in a single multiple choice exercise "
-        "based on diagnosis notes. Return only valid text with the same keys as the original."
+        "based on diagnosis notes. Return an improved exercise that has the same number of answer options as the original, and the same correct answer. For example, if the correct answer is 'Deze stelling is niet correct', then this must remain the correct answer."
     ),
     (
         "user",
-        "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
+        "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"  # the scorecard summary; ideally this would be the complete diagnosis of all issues
         "Rewrite the exercise so that all issues in the diagnosis are resolved. "
         "Use the same structure (prompt, choice_id_1..4, correct_answer_id, explanation)."
     ),
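For reference, the two placeholders in the user message ({exercise_text} and {diagnosis}) are filled at call time, as fix_exercise does via aformat_prompt. A minimal usage sketch with an abbreviated version of the template; the sample exercise and diagnosis strings are made up:

import asyncio
from langchain_core.prompts import ChatPromptTemplate

template_fix_exercise = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that fixes issues in a single "
               "multiple choice exercise based on diagnosis notes."),
    ("user", "Original exercise:\n{exercise_text}\n\nDiagnosis:\n{diagnosis}\n\n"
             "Rewrite the exercise so that all issues in the diagnosis are resolved."),
])

async def main() -> None:
    # Fill the placeholders, then inspect the messages that would be sent.
    prompt_value = await template_fix_exercise.aformat_prompt(
        exercise_text="Q: 2 + 2 = ?\n1) 3\n2) 4\nCorrect: 2",
        diagnosis="❌ Distractor 1 is too implausible.",
    )
    for message in prompt_value.to_messages():
        print(message.type, "->", message.content[:60])

asyncio.run(main())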