Big changes (moved dropdowns into tabs, implemented distractors chain)
- app.py +181 -80
- chains/distractors_chain.py +75 -17
- config/chain_configs.py +13 -7
- config/llm_config.py +24 -8
- config/templates.py +1 -0
app.py
CHANGED

@@ -14,12 +14,98 @@ logger = logging.getLogger(__name__)
 # --- Callback to update the exercise format dropdown based on LLM selection ---
 def update_exercise_format(selected_model: str):
     # When a Claude model is selected, default the format to XML; otherwise, default to Plaintext.
-    if
+    if "Claude" in selected_model:
         return gr.update(value="XML")
     else:
         return gr.update(value="Plaintext")
 
-
+
+
+# Async wrappers for each chain.
+async def run_diagnoser(user_query: str, model_choice_validate: str, exercise_format_validate: str, sampling_count_validate: str) -> tuple:
+    # Figure out how many times to run.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_validate)))
+
+    # Fetch the DiagnoserChain configuration.
+    config = chain_configs["diagnoser"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_validate,
+        config["template_standardize"],  # Only if you kept them in config
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing.
+    chain_instance = config["class"](
+        templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(model_choice_validate, config["llm_diagnose"]),
+        template_diagnose_scorecard=config["template_diagnose_scorecard"],
+        llm_4o_mini=config["llm_4o_mini"],
+        llm_4o=config["llm_4o"]
+    )
+
+    # 3) Run the multiple samples in parallel.
+    # Each task runs only the "diagnose" steps on the already-standardized exercise:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # Run concurrently.
+    responses = await asyncio.gather(*tasks)
+
+    # Pad up to 10 if needed.
+    all_responses = list(responses) + [""] * (10 - len(responses))
+
+    # Return a tuple of exactly 10 responses.
+    return tuple(all_responses)
+
+
+async def run_distractors(
+    user_query: str,
+    model_choice_distractors_1: str,
+    model_choice_distractors_2: str,
+    exercise_format_distractors: str,
+    sampling_count_distractors: str
+) -> tuple:
+    # 0) Parse how many concurrent runs (samples) we want.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_distractors)))
+    # Fetch the DistractorsChain configuration.
+    config = chain_configs["distractors"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_distractors,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Build the DistractorsChain instance.
+    chain_instance = config["class"](
+        template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
+        template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
+        llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]),  # User-selected (low and mid temp GPT-4o by default)
+        llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),
+        template_consolidate=config["template_consolidate"],
+        llm_consolidate=config["llm_consolidate"],
+    )
+
+    # 3) Create N tasks in parallel (one full distractor generation pipeline per sample).
+    tasks = [
+        chain_instance.run(standardized_exercise) for _ in range(num_samples)
+    ]
+    results = await asyncio.gather(*tasks)
+
+    # 4) Pad up to 10 outputs to correspond to the 10 response fields.
+    all_responses = list(results) + [""] * (10 - len(results))
+
+    return tuple(all_responses)
+
+
+# A generic async runner for simple chains (currently not used).
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
     try:
         chain_config = chain_configs.get(chain_name)

@@ -60,48 +146,6 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str)
         logger.error(f"Error in run_chain for '{chain_name}': {e}")
         return f"Error: {e}"
 
-# Async wrappers for each chain.
-async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
-    # figure out how many times to run
-    num_samples = int("".join(filter(str.isdigit, sampling_count)))
-
-    # Fetch the DiagnoserChain configuration.
-    config = chain_configs["diagnoser"]
-
-    # 1) Standardize the user query exactly once
-    standardized_exercise = await standardize_exercise(
-        user_query,
-        exercise_format,
-        config["template_standardize"],  # Only if you kept them in config
-        config["llm_standardize"]
-    )
-
-    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
-    chain_instance = config["class"](
-        templates_diagnose=config["templates_diagnose"],
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
-        template_diagnose_scorecard=config["template_diagnose_scorecard"],
-        llm_4o_mini=config["llm_4o_mini"],
-        llm_4o=config["llm_4o"]
-    )
-
-    # 3) Run the multiple samples in parallel
-    # Create a short helper that does only the "diagnose" steps:
-    tasks = [
-        chain_instance.diagnose_only(standardized_exercise)
-        for _ in range(num_samples)
-    ]
-    # run concurrently
-    responses = await asyncio.gather(*tasks)
-
-    # pad up to 5 if needed
-    all_responses = list(responses) + [""] * (10 - len(responses))
-
-    # Return a tuple of exactly 5 responses.
-    return tuple(all_responses)
-
-async def run_distractors(user_query: str, model_choice: str) -> str:
-    return await run_chain("distractors", {"user_query": user_query}, model_choice)
 
 # -------------------------------
 # Build the Gradio Interface

@@ -116,46 +160,48 @@ with gr.Blocks() as interface:
 
     # --- Main App (initially hidden) ---
     with gr.Column(visible=False, elem_id="main_app") as app_container:
-        gr.Markdown("## Pick the tab for your task of choice
-
-        # Create a row for the control dropdowns
-        with gr.Row():
-            model_choice = gr.Dropdown(
-                choices=list(llms.keys()),
-                value="GPT-4o",
-                label="Select LLM",
-                interactive=True,
-            )
-            exercise_format = gr.Dropdown(
-                choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
-                value="Markdown",
-                label="Exercise Format",
-                interactive=True,
-            )
-            sampling_count = gr.Dropdown(
-                choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
-                value="1",
-                label="Sampling Count",
-                interactive=True,
-            )
-            # Set up a change callback so that if the user selects "Claude 3.5", the exercise format updates to "XML"
-            model_choice.change(
-                fn=update_exercise_format,
-                inputs=[model_choice],
-                outputs=[exercise_format]
-            )
+        gr.Markdown("## Pick the tab for your task of choice")
+
         with gr.Tabs():
-            with gr.TabItem("🩺
+            with gr.TabItem("🩺 Diagnose exercise"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(
                     """
                     <div style="margin-bottom: 10px;">
-                        <span style="font-size: 1.5em; cursor: help;" title="
-                            ℹ️ <i>← mouseover
+                        <span style="font-size: 1.5em; cursor: help;" title="Diagnose exercise: Diagnoses potential issues for the given exercise(s).">
+                            ℹ️ <i>← mouseover</i>
                         </span>
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
+                with gr.Row():
+                    model_choice_validate = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select LLM",
+                        interactive=True,
+                    )
+                    exercise_format_validate = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Markdown",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_validate = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML".
+                model_choice_validate.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_validate],
+                    outputs=[exercise_format_validate]
+                )
+
                 diagnoser_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Exercise body: <mc:exercise xmlns:mc= ...")
                 diagnoser_button = gr.Button("Submit")
                 diagnoser_response_1 = gr.Textbox(label="Response 1", interactive=False)

@@ -168,6 +214,8 @@ with gr.Blocks() as interface:
                 diagnoser_response_8 = gr.Textbox(label="Response 8", interactive=False)
                 diagnoser_response_9 = gr.Textbox(label="Response 9", interactive=False)
                 diagnoser_response_10 = gr.Textbox(label="Response 10", interactive=False)
+
+
             with gr.TabItem("🤖 Generate distractors"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(

@@ -179,10 +227,52 @@ with gr.Blocks() as interface:
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
+                with gr.Row():
+                    model_choice_distractors_1 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select first LLM",
+                        interactive=True,
+                    )
+                    model_choice_distractors_2 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (mid temp)",
+                        label="Select second LLM",
+                        interactive=True,
+                    )
+                    exercise_format_distractors = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Plaintext",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_distractors = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML".
+                model_choice_distractors_1.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_distractors_1],
+                    outputs=[exercise_format_distractors]
+                )
+
                 distractors_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Stelling: Dit is een ..... voorbeeld van een stelling. A. Mooi B. Lelijk ...")
                 distractors_button = gr.Button("Submit")
-                gr.
+                distractors_response_1 = gr.Textbox(label="Response 1", interactive=False)
+                distractors_response_2 = gr.Textbox(label="Response 2", interactive=False)
+                distractors_response_3 = gr.Textbox(label="Response 3", interactive=False)
+                distractors_response_4 = gr.Textbox(label="Response 4", interactive=False)
+                distractors_response_5 = gr.Textbox(label="Response 5", interactive=False)
+                distractors_response_6 = gr.Textbox(label="Response 6", interactive=False)
+                distractors_response_7 = gr.Textbox(label="Response 7", interactive=False)
+                distractors_response_8 = gr.Textbox(label="Response 8", interactive=False)
+                distractors_response_9 = gr.Textbox(label="Response 9", interactive=False)
+                distractors_response_10 = gr.Textbox(label="Response 10", interactive=False)
             with gr.TabItem("🧠 Generate learning objectives"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(

@@ -211,7 +301,7 @@ with gr.Blocks() as interface:
 
     diagnoser_button.click(
         fn=run_diagnoser,
-        inputs=[diagnoser_input,
+        inputs=[diagnoser_input, model_choice_validate, exercise_format_validate, sampling_count_validate],
        outputs=[
            diagnoser_response_1,
            diagnoser_response_2,

@@ -228,8 +318,19 @@ with gr.Blocks() as interface:
 
     distractors_button.click(
        fn=run_distractors,
-        inputs=[distractors_input,
-        outputs=[
+        inputs=[distractors_input, model_choice_distractors_1, model_choice_distractors_2, exercise_format_distractors, sampling_count_distractors],
+        outputs=[
+            distractors_response_1,
+            distractors_response_2,
+            distractors_response_3,
+            distractors_response_4,
+            distractors_response_5,
+            distractors_response_6,
+            distractors_response_7,
+            distractors_response_8,
+            distractors_response_9,
+            distractors_response_10
+        ]
    )
 
 # Launch the app.
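Both wrappers follow the same fan-out pattern: launch num_samples identical chain runs concurrently with asyncio.gather, then pad the result list to ten entries, since a Gradio click handler must return one value per declared output component. A minimal self-contained sketch of that pattern; fake_diagnose is a hypothetical stand-in for the real chain call:

    import asyncio

    N_OUTPUTS = 10  # one slot per fixed gr.Textbox output in the UI

    async def fake_diagnose(exercise: str) -> str:
        # Hypothetical stand-in for chain_instance.diagnose_only(...).
        await asyncio.sleep(0.1)
        return f"diagnosis of: {exercise}"

    async def run_samples(exercise: str, sampling_count: str) -> tuple:
        # "3" -> 3, mirroring int("".join(filter(str.isdigit, sampling_count)))
        num_samples = int("".join(filter(str.isdigit, sampling_count)))
        # Fan out num_samples independent runs and await them concurrently.
        responses = await asyncio.gather(
            *(fake_diagnose(exercise) for _ in range(num_samples))
        )
        # Pad with empty strings so the tuple always has exactly N_OUTPUTS items.
        return tuple(list(responses) + [""] * (N_OUTPUTS - len(responses)))

    print(asyncio.run(run_samples("Example exercise", "3")))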
chains/distractors_chain.py
CHANGED
@@ -1,4 +1,5 @@
 # chains/distractors_chain.py
+import asyncio
 from pydantic import BaseModel
 from typing import Any
 from langchain_core.prompts.chat import ChatPromptTemplate

@@ -6,29 +7,86 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DistractorsChain(BaseModel):
+    template_distractors_brainstorm_1: ChatPromptTemplate
+    template_distractors_brainstorm_2: ChatPromptTemplate
+    llm_brainstorm_1: Any  # User-selectable LLMs for brainstorm (low and high temp)
+    llm_brainstorm_2: Any
+    template_consolidate: ChatPromptTemplate
+    llm_consolidate: Any
 
-    async def run(self, user_query: str, exercise_format: str) -> str:
+    async def run(self, standardized_exercise: str) -> str:
         """
+        Overall flow (step 1, standardizing the exercise, happens in the caller):
+        2) Run 4 parallel brainstorming calls:
+           - 2 use 'template_distractors_brainstorm_1' with (low-temp, high-temp)
+           - 2 use 'template_distractors_brainstorm_2' with (low-temp, high-temp)
+        3) Merge those four partial results into a single final answer
+           via a "consolidation" prompt.
+        4) Return the final string.
         """
-        distractors_messages = prompt_distractors.to_messages()
-        distractors = await self.llm_distr.ainvoke(distractors_messages)
+
+        # --- Step 2: Brainstorm in parallel ---
+        async def run_brainstorm(
+            prompt_template: ChatPromptTemplate,
+            llm_brainstorm: Any,
+            index_label: str
+        ) -> str:
+            # Format the prompt with the standardized exercise.
+            prompt = await prompt_template.aformat_prompt(
+                standardized_exercise=standardized_exercise
+            )
+            messages = prompt.to_messages()
+
+            # Call the specified LLM.
+            response = await llm_brainstorm.ainvoke(messages)
+            content = getattr(response, "content", response)
+
+            return f"[Brainstorm {index_label}]\n{content}"
+
+        tasks = []
+        # Template 1, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_1,
+            "T1-Low"
+        ))
+        # Template 1, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_2,
+            "T1-High"
+        ))
+        # Template 2, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_1,
+            "T2-Low"
+        ))
+        # Template 2, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_2,
+            "T2-High"
+        ))
+
+        # Kick them off concurrently.
+        brainstorm_results = await asyncio.gather(*tasks)
+
+        # Combine them into a single multiline string.
+        combined_brainstorms = "\n\n".join(brainstorm_results)
+
+        # --- Step 3: Consolidate the 4 partial outputs into a final response ---
+        consolidation_prompt = await self.template_consolidate.aformat_prompt(
+            brainstorm_outputs=combined_brainstorms,
+            standardized_exercise=standardized_exercise
+        )
+        consolidation_messages = consolidation_prompt.to_messages()
+
+        consolidation_response = await self.llm_consolidate.ainvoke(consolidation_messages)
+        final_output = getattr(consolidation_response, "content", consolidation_response)
 
+        # Return the final merged distractors response.
+        return final_output
 
     class Config:
         arbitrary_types_allowed = True
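For reference, a minimal usage sketch of the new chain, wired the same way chain_configs and run_distractors do it (assumes the repo's templates and llms registry are importable and API keys are configured):

    import asyncio
    from chains.distractors_chain import DistractorsChain
    from config.templates import (
        template_distractors_brainstorm_1,
        template_distractors_brainstorm_2,
        distractors_consolidate_template,
    )
    from config.llm_config import llms

    chain = DistractorsChain(
        template_distractors_brainstorm_1=template_distractors_brainstorm_1,
        template_distractors_brainstorm_2=template_distractors_brainstorm_2,
        llm_brainstorm_1=llms["GPT-4o (low temp)"],
        llm_brainstorm_2=llms["GPT-4o (mid temp)"],
        template_consolidate=distractors_consolidate_template,
        llm_consolidate=llms["GPT-4o (low temp)"],
    )

    # Step 1 (standardizing the exercise) is the caller's responsibility.
    standardized = "Stelling: ... A. Mooi B. Lelijk ..."
    print(asyncio.run(chain.run(standardized)))

Crossing the two brainstorm templates with the two temperature variants yields four complementary drafts for the consolidation prompt to merge.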
config/chain_configs.py
CHANGED
@@ -7,20 +7,23 @@ from config.templates import (
     template_diagnose_correct_answer_stands_out,
     template_diagnose_distractor_clearly_wrong,
     template_diagnose_distractor_partially_correct,
-    diagnose_scorecard_template
+    diagnose_scorecard_template,
+    template_distractors_brainstorm_1,
+    template_distractors_brainstorm_2,
+    distractors_consolidate_template
 )
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
 
-# Note: The default LLM here is 4o; the UI can override this choice.
+# Note: The default LLM here is GPT-4o (low temp); the UI can override this choice.
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
         "llm_4o_mini": llms["GPT-4o-mini"],
-        "llm_4o": llms["GPT-4o"],
+        "llm_4o": llms["GPT-4o (low temp)"],
         # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
             template_diagnose_double_negation,

@@ -29,14 +32,17 @@ chain_configs = {
             template_diagnose_distractor_partially_correct,
         ],
         "template_diagnose_scorecard": diagnose_scorecard_template,
-        "llm_diagnose": llms["GPT-4o"],  # Default; can be replaced in UI
+        "llm_diagnose": llms["GPT-4o (low temp)"],  # Default; can be replaced in UI
     },
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
+        "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
+        "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
+        "llm_brainstorm_1": llms["GPT-4o (low temp)"],
+        "llm_brainstorm_2": llms["GPT-4o (mid temp)"],
+        "template_consolidate": distractors_consolidate_template,
+        "llm_consolidate": llms["GPT-4o (low temp)"],  # or something else
     },
 }
config/llm_config.py
CHANGED
@@ -19,8 +19,12 @@ HIGH = 1.2
 def create_openai_llm(model_name: str, temperature: float):
     return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, temperature=temperature)
 
-def create_openai_reasoning_llm(model_name: str):
+def create_openai_reasoning_llm(model_name: str, reasoning_effort: str = None):
+    # If reasoning_effort is provided, pass it; otherwise, avoid sending the parameter.
+    if reasoning_effort:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, reasoning_effort=reasoning_effort)
+    else:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name)
 
 def create_anthropic_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)

@@ -29,14 +33,26 @@ def create_deepseek_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)
 
 llms = {
+    # OpenAI models with temperature
+
+    "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
+    "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
+    "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
     "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
-    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09",
+    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
+
+    # OpenAI reasoning models (no temperature)
     "o1": create_openai_reasoning_llm("o1-2024-12-17"),
-    "o3-mini": create_openai_reasoning_llm("o3-mini
+    "o3-mini (high-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
+
+    # Anthropic models (Claude)
+    "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
+    "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
+    "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
+
+    # DeepSeek
+    "Deepseek R1 (low temp) 🧠": create_deepseek_llm("deepseek-reasoner", LOW),
+}
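Because the UI labels double as registry keys, callers resolve a label with a fallback default, the same llms.get(choice, default) pattern app.py uses. A minimal sketch, assuming the keys defined above and a configured OPENAI_API_KEY:

    import asyncio
    from config.llm_config import llms

    async def main():
        # Fall back to the default model if the label is unknown.
        llm = llms.get("o3-mini (high-reasoning version)", llms["GPT-4o (low temp)"])
        response = await llm.ainvoke("Write one plausible distractor for: 2 + 2 = ?")
        print(getattr(response, "content", response))

    asyncio.run(main())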
config/templates.py
CHANGED
@@ -178,6 +178,7 @@ diagnose_scorecard_template = ChatPromptTemplate(
         <example 3>
         1. The exercise contains a double negative: β -- 2. The correct answer does not stand out: β -- 3. Some of the distractors are too obviously false: β -- 4. None of the distractors are actually also kinda correct: β
         </example 3>
+        Sometimes the diagnoses will be short and clear, but sometimes they will be elaborate and view the issue from different angles. In that case, overweight the final sentence of the diagnosis; that is usually where the conclusion is drawn.
         """),
         ("human", "{combined_diagnosis}")
     ],
|