BtB-ExpC committed on
Commit
4cbd4c5
Β·
1 Parent(s): 7aead4f

Cleaned up model picker and exercise formatting standardization handling

Browse files
app.py CHANGED
@@ -116,10 +116,10 @@ async def run_distractors(
116
  chain_instance = config["class"](
117
  template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
118
  template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
119
- llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]), # User-selected (low and high temp GPT-4o by default)
120
- llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),
121
  template_consolidate=config["template_consolidate"],
122
- llm_consolidate=config["llm_consolidate"],
123
  )
124
 
125
  # 3) Create N tasks in parallel (one full distractor generation pipeline per sample)
@@ -262,14 +262,14 @@ with gr.Blocks() as interface:
262
  with gr.Row():
263
  model_choice_distractors_1 = gr.Dropdown(
264
  choices=list(llms.keys()),
265
- value="GPT-4o (low temp)",
266
- label="LLM 1",
267
  interactive=True,
268
  )
269
  model_choice_distractors_2 = gr.Dropdown(
270
  choices=list(llms.keys()),
271
- value="GPT-4o (mid temp)",
272
- label="LLM 2",
273
  interactive=True,
274
  )
275
  exercise_format_distractors = gr.Dropdown(
@@ -278,25 +278,31 @@ with gr.Blocks() as interface:
278
  label="Exercise Format",
279
  interactive=True,
280
  )
281
- sampling_count_distractors = gr.Dropdown(
282
- choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
283
- value="1",
284
- label="Response Count",
285
  interactive=True,
286
  )
287
- intermediate_distractors_specification = gr.Dropdown(
288
- choices=["", "2", "3", "4", "5", "6", "7", "8", "9", "10", "a few", "some", "a whole lot of", "a wide range of", "novel"],
289
- value="8",
290
- label="Brainstorm Nx4 distractors",
291
  interactive=True,
292
  )
293
  final_distractors_specification = gr.Dropdown(
294
- choices=["all unique distractors", "the best distractors", "only the very best distractors", "4", "5", "6", "7", "8", "9", "10", "11", "12", "a few", "some", "a whole lot of",
295
- "a wide range of", "novel"],
296
- value="all unique distractors",
297
  label="Finally display X distractors",
298
  interactive=True,
299
  )
 
 
 
 
 
 
300
  # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
301
  model_choice_distractors_1.change(
302
  fn=update_exercise_format,
 
116
  chain_instance = config["class"](
117
  template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
118
  template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
119
+ llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]), # User-selected LLM 1
120
+ llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]), # User-selected LLM 2
121
  template_consolidate=config["template_consolidate"],
122
+ llm_consolidate=llms.get(model_choice_distractors_3, config["llm_consolidate"]), # User-selected LLM 3
123
  )
124
 
125
  # 3) Create N tasks in parallel (one full distractor generation pipeline per sample)
 
262
  with gr.Row():
263
  model_choice_distractors_1 = gr.Dropdown(
264
  choices=list(llms.keys()),
265
+ value="GPT-4o (mid temp)",
266
+ label="LLM 1 - for brainstorming",
267
  interactive=True,
268
  )
269
  model_choice_distractors_2 = gr.Dropdown(
270
  choices=list(llms.keys()),
271
+ value="Claude 3.5 (mid temp)",
272
+ label="LLM 2 - for brainstorming",
273
  interactive=True,
274
  )
275
  exercise_format_distractors = gr.Dropdown(
 
278
  label="Exercise Format",
279
  interactive=True,
280
  )
281
+ intermediate_distractors_specification = gr.Dropdown(
282
+ choices=[" ", " 2 ", " 3 ", " 4 ", " 5 ", " 6 ", " 7 ", " 8 ", " 9 ", " 10 ", " a few ", " some ", " a whole lot of ", " a wide range of ", " novel "],
283
+ value=" 8 ",
284
+ label="Brainstorm X distractors x4",
285
  interactive=True,
286
  )
287
+ model_choice_distractors_3 = gr.Dropdown(
288
+ choices=list(llms.keys()),
289
+ value="GPT-4o (low temp)",
290
+ label="LLM 3 - for interpreting results",
291
  interactive=True,
292
  )
293
  final_distractors_specification = gr.Dropdown(
294
+ choices=[" ", " of all unique distractors", " of the top 5", " of the best distractors", " of only the very best", " of the best 4", " of the best 5", " of the best 6", " of the best 7", " of the best 8", " of the best 9", " of the best 10", " of the best 11", " of the best 12", " of a few of them", " of some of them", " of most of them",
295
+ " of a wide range of", " of the 3 worst"],
296
+ value=" of all unique distractors",
297
  label="Finally display X distractors",
298
  interactive=True,
299
  )
300
+ sampling_count_distractors = gr.Dropdown(
301
+ choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
302
+ value="1",
303
+ label="Response Count",
304
+ interactive=True,
305
+ )
306
  # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
307
  model_choice_distractors_1.change(
308
  fn=update_exercise_format,
chains/distractors_chain.py CHANGED
@@ -41,32 +41,32 @@ class DistractorsChain(BaseModel):
41
  response = await llm_brainstorm.ainvoke(messages)
42
  content = getattr(response, "content", response)
43
 
44
- return f"[Brainstorm {index_label}]\n{content}"
45
 
46
  tasks = []
47
- # Template 1, low-temp
48
  tasks.append(run_brainstorm(
49
  self.template_distractors_brainstorm_1,
50
  self.llm_brainstorm_1,
51
- "T1-Low"
52
  ))
53
- # Template 1, high-temp
54
  tasks.append(run_brainstorm(
55
  self.template_distractors_brainstorm_1,
56
  self.llm_brainstorm_2,
57
- "T1-High"
58
  ))
59
- # Template 2, low-temp
60
  tasks.append(run_brainstorm(
61
  self.template_distractors_brainstorm_2,
62
  self.llm_brainstorm_1,
63
- "T2-Low"
64
  ))
65
- # Template 2, high-temp
66
  tasks.append(run_brainstorm(
67
  self.template_distractors_brainstorm_2,
68
  self.llm_brainstorm_2,
69
- "T2-High"
70
  ))
71
 
72
  # Kick them off concurrently
 
41
  response = await llm_brainstorm.ainvoke(messages)
42
  content = getattr(response, "content", response)
43
 
44
+ return f"[ --- list separator {index_label} ---]\n\n{content}"
45
 
46
  tasks = []
47
+ # Template 1, LLM 1
48
  tasks.append(run_brainstorm(
49
  self.template_distractors_brainstorm_1,
50
  self.llm_brainstorm_1,
51
+ "T1-1"
52
  ))
53
+ # Template 1, LLM 2
54
  tasks.append(run_brainstorm(
55
  self.template_distractors_brainstorm_1,
56
  self.llm_brainstorm_2,
57
+ "T1-2"
58
  ))
59
+ # Template 2, LLM 1
60
  tasks.append(run_brainstorm(
61
  self.template_distractors_brainstorm_2,
62
  self.llm_brainstorm_1,
63
+ "T2-1"
64
  ))
65
+ # Template 2, LLM 2
66
  tasks.append(run_brainstorm(
67
  self.template_distractors_brainstorm_2,
68
  self.llm_brainstorm_2,
69
+ "T2-2"
70
  ))
71
 
72
  # Kick them off concurrently
config/chain_configs.py CHANGED
@@ -19,8 +19,8 @@ chain_configs = {
19
  "diagnoser": {
20
  "class": DiagnoserChain,
21
  "template_standardize": standardize_template,
22
- "llm_standardize": llms["GPT-4o-mini-zero"], # Always fixed
23
- "llm_4o_mini": llms["GPT-4o-mini"],
24
  "llm_4o": llms["GPT-4o (low temp)"],
25
  # 4 different diagnosis templates (to run in parallel:
26
  "templates_diagnose": [
@@ -35,7 +35,7 @@ chain_configs = {
35
  "distractors": {
36
  "class": DistractorsChain,
37
  "template_standardize": standardize_template,
38
- "llm_standardize": llms["GPT-4o-mini-zero"], # Always fixed
39
  "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
40
  "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
41
  "llm_brainstorm_1": llms["GPT-4o (low temp)"],
 
19
  "diagnoser": {
20
  "class": DiagnoserChain,
21
  "template_standardize": standardize_template,
22
+ "llm_standardize": llms["GPT-4o-mini (zero temp)"], # Always fixed
23
+ "llm_4o_mini": llms["GPT-4o-mini (low temp)"],
24
  "llm_4o": llms["GPT-4o (low temp)"],
25
  # 4 different diagnosis templates (to run in parallel:
26
  "templates_diagnose": [
 
35
  "distractors": {
36
  "class": DistractorsChain,
37
  "template_standardize": standardize_template,
38
+ "llm_standardize": llms["GPT-4o-mini (zero temp)"], # Always fixed
39
  "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
40
  "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
41
  "llm_brainstorm_1": llms["GPT-4o (low temp)"],
config/exercise_standardizer.py CHANGED
@@ -22,6 +22,7 @@ async def standardize_exercise(user_query: str, exercise_format: str, template:
22
  )
23
 
24
  std_messages = prompt_std.to_messages()
25
- standardized_exercise = await llm.ainvoke(std_messages)
 
26
 
27
  return standardized_exercise
 
22
  )
23
 
24
  std_messages = prompt_std.to_messages()
25
+ response = await llm.ainvoke(std_messages)
26
+ standardized_exercise = getattr(response, "content", response)
27
 
28
  return standardized_exercise
config/llm_config.py CHANGED
@@ -12,7 +12,7 @@ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
12
  # Define temperature presets (adjust as needed)
13
  ZERO = 0
14
  LOW = 0.2
15
- MID = 0.7
16
  HIGH = 1.2
17
 
18
  # Factory functions for each provider
@@ -35,15 +35,13 @@ def create_deepseek_llm(model_name: str, temperature: float):
35
  # all of them in one dictionary
36
  llms = {
37
  # OpenAI models with temperature
38
-
39
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
40
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
41
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
42
- "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
43
- "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
44
- "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
45
- "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
46
- "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
47
 
48
  # OpenAI reasoning models (no temperature)
49
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
@@ -52,40 +50,46 @@ llms = {
52
  "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
53
 
54
  # Anthropic models (Claude)
 
55
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
56
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
57
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
58
 
59
  # DeepSeek
 
60
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
 
 
61
  }
62
 
63
  # specific for Diagnosis tab
64
- llms_diagnosis_tab = {
65
- # OpenAI models with temperature
66
-
67
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
68
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
69
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
70
- "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
71
- "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
72
- "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
73
- "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
74
- "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
75
 
76
  # OpenAI reasoning models (no temperature)
77
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
78
- "o3-mini (low-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
79
- "o3-mini (medium-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
80
- "o3-mini (high-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
81
-
82
-
83
 
84
- # Anthropic models (Claude)
 
85
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
86
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
87
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
 
 
88
 
89
  # DeepSeek
 
90
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
 
 
91
  }
 
12
  # Define temperature presets (adjust as needed)
13
  ZERO = 0
14
  LOW = 0.2
15
+ MID = 0.6
16
  HIGH = 1.2
17
 
18
  # Factory functions for each provider
 
35
  # all of them in one dictionary
36
  llms = {
37
  # OpenAI models with temperature
38
+ "GPT-4o (zero temp)": create_openai_llm("gpt-4o", ZERO),
39
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
40
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
41
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
42
+ "GPT-4o-mini (zero temp)": create_openai_llm("gpt-4o-mini", ZERO),
43
+ "GPT-4o-mini (low temp)": create_openai_llm("gpt-4o-mini", LOW),
44
+ "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
 
45
 
46
  # OpenAI reasoning models (no temperature)
47
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
 
50
  "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
51
 
52
  # Anthropic models (Claude)
53
+ "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
54
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
55
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
56
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
57
 
58
  # DeepSeek
59
+ "Deepseek R1 (zero temp)🚧": create_anthropic_llm("deepseek-reasoner", ZERO),
60
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
61
+ "Deepseek R1 (mid temp)🚧": create_anthropic_llm("deepseek-reasoner", MID),
62
+ "Deepseek R1 (high temp)🚧": create_anthropic_llm("deepseek-reasoner", HIGH),
63
  }
64
 
65
  # specific for Diagnosis tab
66
+ llms_most_wanted = {
67
+ # OpenAI models
68
+ "GPT-4o (zero temp)": create_openai_llm("gpt-4o", ZERO),
69
  "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
70
  "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
71
  "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
72
+ "GPT-4o-mini (zero temp)": create_openai_llm("gpt-4o-mini", ZERO),
73
+ "GPT-4o-mini (low temp)": create_openai_llm("gpt-4o-mini", LOW),
74
+ "GPT-4 Turbo (low temp)": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
 
 
75
 
76
  # OpenAI reasoning models (no temperature)
77
  "o1": create_openai_reasoning_llm("o1-2024-12-17"),
78
+ "o3-mini (low-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="low"),
79
+ "o3-mini (medium-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="medium"),
80
+ "o3-mini (high-reasoning effort version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
 
 
81
 
82
+ # Anthropic models
83
+ "Claude 3.5 (zero temp)": create_anthropic_llm("claude-3-5-sonnet-latest", ZERO),
84
  "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
85
  "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
86
  "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
87
+ "Claude 3.5 Haiku (zero temp)": create_anthropic_llm("claude-3-5-haiku-latest", HIGH),
88
+ "Claude 3.5 Haiku (low temp)": create_anthropic_llm("claude-3-5-haiku-latest", HIGH),
89
 
90
  # DeepSeek
91
+ "Deepseek R1 (zero temp)🚧": create_anthropic_llm("deepseek-reasoner", ZERO),
92
  "Deepseek R1 (low temp)🚧": create_anthropic_llm("deepseek-reasoner", LOW),
93
+ "Deepseek R1 (mid temp)🚧": create_anthropic_llm("deepseek-reasoner", MID),
94
+ "Deepseek R1 (high temp)🚧": create_anthropic_llm("deepseek-reasoner", HIGH),
95
  }
config/templates.py CHANGED
@@ -189,7 +189,7 @@ diagnose_scorecard_template = ChatPromptTemplate(
189
 
190
  template_distractors_brainstorm_1 = ChatPromptTemplate(
191
  messages=[
192
- ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with {intermediate_distractors_specification} additional high-quality distractors: "
193
  "alternative answer options that are not correct, yet also not so implausible that even poorly informed students would immediately dismiss them. Make sure to use the same language as the existing exercise."),
194
  ("human", "{standardized_exercise}")
195
  ],
@@ -198,7 +198,7 @@ template_distractors_brainstorm_1 = ChatPromptTemplate(
198
 
199
  template_distractors_brainstorm_2 = ChatPromptTemplate(
200
  messages=[
201
- ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with {intermediate_distractors_specification} additional high-quality distractors: "
202
  "alternative answer options that are not correct, yet not so implausible that even poorly informed students would immediately dismiss them. Go about this very methodically: "
203
  "Really try to think outside of the box and get creative here, providing potential alternative distractors across a wide range of options. "
204
  "Before you present your final selection, take your time to really consider the entire solution space, weighing your different ideas an options, then to list the distractors. Make sure to use the same language as the existing exercise."),
@@ -211,13 +211,12 @@ template_distractors_brainstorm_2 = ChatPromptTemplate(
211
  template_consolidate_distractors = ChatPromptTemplate(
212
  messages=[
213
  ("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated into one list. "
214
- "Filter out duplicates, do some logical sorting among them, and just return one plain list of {final_distractors_specification}. "
215
- "Only focus on the distractors (answer options) themselves, ignore any reasoning about them. Return only the list, formatted without numbering or bullet points, just every distractor on its own line. Use the same language as the existing exercise. "),
216
- ("human", "For context, this is the exercise that the distractors are about: "
217
  "{standardized_exercise} "
218
- ""
219
- "Here are the lists:"
220
- "{brainstorm_outputs}")
221
  ],
222
  input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
223
  )
 
189
 
190
  template_distractors_brainstorm_1 = ChatPromptTemplate(
191
  messages=[
192
+ ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with{intermediate_distractors_specification}additional high-quality distractors: "
193
  "alternative answer options that are not correct, yet also not so implausible that even poorly informed students would immediately dismiss them. Make sure to use the same language as the existing exercise."),
194
  ("human", "{standardized_exercise}")
195
  ],
 
198
 
199
  template_distractors_brainstorm_2 = ChatPromptTemplate(
200
  messages=[
201
+ ("system", "You are a brainstorming assistant. Based on the given multiple choice exercise, come up with{intermediate_distractors_specification}additional high-quality distractors: "
202
  "alternative answer options that are not correct, yet not so implausible that even poorly informed students would immediately dismiss them. Go about this very methodically: "
203
  "Really try to think outside of the box and get creative here, providing potential alternative distractors across a wide range of options. "
204
  "Before you present your final selection, take your time to really consider the entire solution space, weighing your different ideas an options, then to list the distractors. Make sure to use the same language as the existing exercise."),
 
211
  template_consolidate_distractors = ChatPromptTemplate(
212
  messages=[
213
  ("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated into one list. "
214
+ "Filter out duplicates, do some logical sorting among them, and just return one plain list{final_distractors_specification}. "
215
+ "Only focus on the distractors (answer options) themselves, ignore any reasoning about them. Return only the list, nothing else. Format the list without numbering or bullet points, just put every distractor on its own line. Use the same language as the existing exercise. "),
216
+ ("human", "For context, this is the exercise that the distractors are about:\n "
217
  "{standardized_exercise} "
218
+ "Here are the lists:\n "
219
+ "{brainstorm_outputs} ")
 
220
  ],
221
  input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
222
  )