refactored diagnoser chain

- app.py +29 -14
- chains/diagnoser_chain.py +20 -24
- config/chain_configs.py +11 -6
- config/llm_config.py +2 -1
- config/templates.py +8 -8
- test exercises.txt → test exercises.md +0 -0
app.py
CHANGED

```diff
@@ -4,7 +4,8 @@ import os
 import asyncio
 import logging
 
-from
+from config.exercise_standardizer import standardize_exercise
+from utils.auth import login as auth_login
 from config.chain_configs import chain_configs
 from config.llm_config import llms
 
@@ -16,7 +17,7 @@ def update_exercise_format(selected_model: str):
     if selected_model == "Claude 3.5":
         return gr.update(value="XML")
     else:
-        return gr.update(value="
+        return gr.update(value="Plaintext")
 
 # A generic async runner for chains.
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
@@ -61,26 +62,40 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
 
 # Async wrappers for each chain.
 async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
+    # figure out how many times to run
     num_samples = int("".join(filter(str.isdigit, sampling_count)))
+
     # Fetch the DiagnoserChain configuration.
     config = chain_configs["diagnoser"]
 
-    #
-
-
+    # 1) Standardize the user query exactly once
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
     chain_instance = config["class"](
-        template_standardize=config["template_standardize"],
         templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
         template_diagnose_scorecard=config["template_diagnose_scorecard"],
-
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"])  # Override or fallback to default
+        llm_4o_mini=config["llm_4o_mini"]
     )
-
-
-
-
-
-
+
+    # 3) Run the multiple samples in parallel
+    # Create a short helper that does only the "diagnose" steps:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # run concurrently
+    responses = await asyncio.gather(*tasks)
+
+    # pad up to 5 if needed
+    all_responses = list(responses) + [""] * (5 - len(responses))
+
     # Return a tuple of exactly 5 responses.
     return tuple(all_responses)
 
```
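The net effect in app.py is easier to see outside the diff: standardization now happens once per request, and only the diagnosis step is fanned out `num_samples` times with `asyncio.gather`. A minimal, runnable sketch of the pattern, with hypothetical `fake_standardize`/`fake_diagnose_only` coroutines standing in for the real LLM calls:

```python
import asyncio

# Hypothetical stand-ins for standardize_exercise and DiagnoserChain.diagnose_only;
# each sleep() simulates one LLM round-trip.
async def fake_standardize(user_query: str) -> str:
    await asyncio.sleep(0.01)
    return f"[standardized] {user_query}"

async def fake_diagnose_only(standardized: str) -> str:
    await asyncio.sleep(0.01)
    return f"diagnosis of: {standardized}"

async def run_diagnoser_sketch(user_query: str, num_samples: int) -> tuple:
    # Standardize exactly once, then fan the result out to N parallel samples.
    standardized = await fake_standardize(user_query)
    responses = await asyncio.gather(
        *(fake_diagnose_only(standardized) for _ in range(num_samples))
    )
    # Pad to a fixed arity of 5 so the Gradio output slots always line up.
    return tuple(list(responses) + [""] * (5 - len(responses)))

print(asyncio.run(run_diagnoser_sketch("Which option is correct?", 3)))
```

Judging by the `# Step 1: Standardize` block removed from the chain below, each sample previously re-ran standardization, so at `num_samples = 5` this saves four identical standardize calls per request.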
chains/diagnoser_chain.py
CHANGED

```diff
@@ -7,49 +7,45 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DiagnoserChain(BaseModel):
-    template_standardize: ChatPromptTemplate
-    llm_standardize: Any  # Fixed LLM for step 1
     templates_diagnose: List[ChatPromptTemplate]
-    llm_diagnose: Any  # User-selectable LLM for step 2
     template_diagnose_scorecard: ChatPromptTemplate
+    llm_diagnose: Any
+    llm_4o_mini: Any
 
-    async def
+    async def diagnose_only(self, standardized_exercise: str) -> str:
         """
-
-        1
-        2
-        3
-        4
-
+        Takes a PRE-standardized exercise and:
+        (1) Runs multiple diagnosis prompts in parallel,
+        (2) Merges the results,
+        (3) Generates a scorecard line,
+        (4) Returns the combined text + scorecard.
         """
-        # Step 1: Standardize the exercise.
-        standardized_exercise = await standardize_exercise(
-            user_query, exercise_format, self.template_standardize, self.llm_standardize
-        )
 
-        # Step
+        # Step 1: define an async helper to run each diagnosis in parallel
         async def run_single_diagnosis(template: ChatPromptTemplate, idx: int) -> str:
             prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
             messages = prompt.to_messages()
             diagnosis_response = await self.llm_diagnose.ainvoke(messages)
-            content =
-            return f"--- [DIAGNOSIS {idx}]
+            content = getattr(diagnosis_response, "content", diagnosis_response)
+            return f"--- [DIAGNOSIS {idx}] ---\n{content}"
 
-        #
+        # Step 2: launch all diagnoses concurrently
         tasks = [
             run_single_diagnosis(template, idx)
             for idx, template in enumerate(self.templates_diagnose, start=1)
         ]
         diagnoses = await asyncio.gather(*tasks)
 
-        # Step 3:
-        combined_diagnosis = "\n
+        # Step 3: combine the outputs
+        combined_diagnosis = "\n".join(diagnoses)
 
-        # Step 4: Generate scorecard
-        prompt = await self.template_diagnose_scorecard.aformat_prompt(
+        # Step 4: Generate a one-line scorecard
+        prompt = await self.template_diagnose_scorecard.aformat_prompt(
+            combined_diagnosis=combined_diagnosis
+        )
         scorecard_messages = prompt.to_messages()
-        scorecard_response = await self.
-        scorecard =
+        scorecard_response = await self.llm_4o_mini.ainvoke(scorecard_messages)
+        scorecard = getattr(scorecard_response, "content", scorecard_response)
 
         return combined_diagnosis + "\n\n" + scorecard
 
```
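One detail worth flagging in the new `diagnose_only`: `getattr(response, "content", response)`. Chat-style model wrappers typically return a message object whose text lives in `.content`, while completion-style wrappers may return a bare string; the `getattr` fallback handles both shapes. A tiny sketch with a hypothetical `FakeChatResponse`:

```python
class FakeChatResponse:
    """Hypothetical stand-in for a chat-model response object exposing .content."""
    def __init__(self, content: str):
        self.content = content

def extract_content(response):
    # Message objects expose .content; bare strings pass through unchanged.
    return getattr(response, "content", response)

assert extract_content(FakeChatResponse("diagnosis text")) == "diagnosis text"
assert extract_content("diagnosis text") == "diagnosis text"
print("both response shapes handled")
```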
config/chain_configs.py
CHANGED

```diff
@@ -1,8 +1,14 @@
 # config/chain_configs.py
-from config.templates import
-
-
+from config.templates import (
+    standardize_template,
+    diagnose_template,
+    distractors_template,
+    template_diagnose_double_negation,
+    template_diagnose_correct_answer_stands_out,
+    template_diagnose_distractor_clearly_wrong,
+    template_diagnose_distractor_partially_correct,
     diagnose_scorecard_template
+)
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
@@ -11,8 +17,7 @@ from config.llm_config import llms
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
-        "
-        "llm_standardize": llms["GPT-4o-mini"],  # Always fixed
+        "llm_4o_mini": llms["GPT-4o-mini"],
         # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
             template_diagnose_double_negation,
@@ -26,7 +31,7 @@ chain_configs = {
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
-        "llm_standardize": llms["GPT-4o-mini"],  # Always fixed
+        "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
         "template_distractors": distractors_template,
         "llm_distractors": llms["GPT-4o"],  # Default; can be replaced in UI
     },
```
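Each `chain_configs` entry pairs a `"class"` with the keyword arguments its constructor expects, which is what lets `app.py` build a fresh chain per request straight from the config. A small sketch of that shape, using a hypothetical `DemoChain` dataclass in place of the real pydantic `DiagnoserChain`:

```python
from dataclasses import dataclass

@dataclass
class DemoChain:
    # Hypothetical stand-in for DiagnoserChain's fields.
    templates_diagnose: list
    template_diagnose_scorecard: str
    llm_diagnose: str
    llm_4o_mini: str

demo_configs = {
    "diagnoser": {
        "class": DemoChain,
        "templates_diagnose": ["double_negation", "answer_stands_out"],
        "template_diagnose_scorecard": "scorecard-template",
        "llm_diagnose": "gpt-4o",      # default; the UI can override it
        "llm_4o_mini": "gpt-4o-mini",  # fixed model for the scorecard step
    }
}

config = demo_configs["diagnoser"]
# Everything except "class" is a constructor kwarg, so instantiation can be generic:
chain = config["class"](**{k: v for k, v in config.items() if k != "class"})
print(chain)
```

Note that `app.py` above spells the kwargs out explicitly rather than splatting them; the generic form is just one way the `"class"` key could be used.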
config/llm_config.py
CHANGED

```diff
@@ -30,7 +30,8 @@ def create_deepseek_llm(model_name: str, temperature: float):
 
 llms = {
     "GPT-4o": create_openai_llm("gpt-4o", LOW),
-    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
     "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", HIGH),
```
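The mini-model split matters downstream: `"GPT-4o-mini"` now runs at LOW temperature, while the deterministic ZERO variant moves to the new `"GPT-4o-mini-zero"` key (which the distractors config above switches to). A sketch of the preset pattern; the numeric ZERO/LOW/HIGH values here are assumptions, since the real constants aren't visible in this hunk:

```python
# Assumed values: the actual ZERO/LOW/HIGH constants live elsewhere
# in config/llm_config.py and are not shown in the diff.
ZERO, LOW, HIGH = 0.0, 0.2, 1.0

def create_demo_llm(model_name: str, temperature: float) -> dict:
    # Stand-in for create_openai_llm; returns a plain dict instead of a client.
    return {"model": model_name, "temperature": temperature}

llms = {
    "GPT-4o-mini-zero": create_demo_llm("gpt-4o-mini", ZERO),  # deterministic standardizer
    "GPT-4o-mini": create_demo_llm("gpt-4o-mini", LOW),        # general-purpose default
}
print(llms["GPT-4o-mini-zero"], llms["GPT-4o-mini"])
```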
config/templates.py
CHANGED

```diff
@@ -60,7 +60,7 @@ template_diagnose_double_negation = ChatPromptTemplate(
         </double negative explanation>
         </example 2>.
         If it's obvious that there is or isn't a double negative in this exercise, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -114,7 +114,7 @@ template_diagnose_correct_answer_stands_out = ChatPromptTemplate(
         </example where the correct answer is grammatically different>
 
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If the correct answer in the given exercise clearly does or does not stand out, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -134,7 +134,7 @@ template_diagnose_distractor_clearly_wrong = ChatPromptTemplate(
         be plausible but incorrect.
         Identify distractors that are obviously wrong, such that even students that are completely uninformed about the topic can eliminate them.
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't obviously incorrect, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -146,7 +146,7 @@ template_diagnose_distractor_partially_correct = ChatPromptTemplate(
         partially correct. Some answer choices may contain elements of truth, leading to
         ambiguity. Identify such cases. Really stress-test them: is there a story you could tell where the distractor, in the context of this exercise, could be considered a (partially) correct answer?
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't unambiguously false, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then.
         """),
         ("human", "{standardized_exercise}")
     ],
@@ -166,16 +166,16 @@ diagnose_scorecard_template = ChatPromptTemplate(
         (and a third icon if need be: - ❓ means the diagnosis is unclear)
         The scorecard should always look like this:
         <template>
-
+        The exercise does not contain/contains a double negative: ✅/❌ -- The correct answer does not/does stand out: ✅/❌ -- None/Some of the distractors are too obviously false: ✅/❌ -- None/Some of the distractors are actually also kinda correct: ✅/❌
         </template>
         <example 1>
-
+        The exercise doesn't contain a double negative: ✅ -- The correct answer does not stand out: ✅ -- None of the distractors are too obviously false: ✅ -- None of the distractors are actually also kinda correct: ✅
         </example 1>
         <example 2>
-
+        The exercise doesn't contain a double negative: ✅ -- The correct answer does stand out: ❌ -- None of the distractors are too obviously false: ✅ -- Some of the distractors are actually also kinda correct: ❌
         </example 2>
         <example 3>
-
+        The exercise contains a double negative: ❌ -- The correct answer does not stand out: ✅ -- Some of the distractors are too obviously false: ❌ -- None of the distractors are actually also kinda correct: ✅
         </example 3>
         """),
         ("human", "{combined_diagnosis}")
```
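The scorecard the template asks for is one `--`-separated line of claim/icon pairs. A hypothetical helper that renders the same shape, useful for eyeballing what a well-formed scorecard line should look like:

```python
def render_scorecard(findings: dict) -> str:
    # True means "no issue found" (✅) and False means "issue found" (❌),
    # mirroring the icon convention in diagnose_scorecard_template.
    parts = [f"{claim}: {'✅' if ok else '❌'}" for claim, ok in findings.items()]
    return " -- ".join(parts)

print(render_scorecard({
    "The exercise doesn't contain a double negative": True,
    "The correct answer does stand out": False,
    "None of the distractors are too obviously false": True,
    "Some of the distractors are actually also kinda correct": False,
}))
```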
test exercises.txt → test exercises.md
RENAMED

File without changes
|