Spaces:

BtB-ExpC
/

Exercises

Sleeping

App Files Files Community

BtB-ExpC committed on Feb 9, 2025

Commit

40f7f2d

1 Parent(s): 19d0e21

scorecard step added to diagnosis

Browse files

Files changed (3) hide show

chains/diagnoser_chain.py +19 -9
config/chain_configs.py +9 -7
config/templates.py +33 -4

chains/diagnoser_chain.py CHANGED Viewed

@@ -8,16 +8,19 @@ from config.exercise_standardizer import standardize_exercise
 class DiagnoserChain(BaseModel):
     template_standardize: ChatPromptTemplate
-    llm_standardize: Any  # Fixed LLM for step 1
     templates_diagnose: List[ChatPromptTemplate]
     llm_diagnose: Any  # User-selectable LLM for step 2
     async def run(self, user_query: str, exercise_format: str) -> str:
         """
         Runs the composite chain:
-          1. Standardizes the exercise formatting.
-          2. Feeds the standardized exercise into multiple diagnosis prompts in parallel.
-          3. Combines the outputs from all prompts.
         """
         # Step 1: Standardize the exercise.
         standardized_exercise = await standardize_exercise(
@@ -25,23 +28,30 @@ class DiagnoserChain(BaseModel):
         )
         # Step 2: Define an async helper to run a single diagnosis prompt.
-        async def run_single(template: ChatPromptTemplate, idx: int) -> str:
             prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
             messages = prompt.to_messages()
             diagnosis_response = await self.llm_diagnose.ainvoke(messages)
             content = diagnosis_response.content if hasattr(diagnosis_response, "content") else diagnosis_response
-            return f"**Diagnosis {idx}:**\n{content}"
         # Launch all diagnosis tasks concurrently.
         tasks = [
-            run_single(template, idx)
             for idx, template in enumerate(self.templates_diagnose, start=1)
         ]
         diagnoses = await asyncio.gather(*tasks)
         # Step 3: Combine the outputs from each prompt.
-        combined_diagnosis = "\n\n---\n\n".join(diagnoses)
-        return combined_diagnosis
     class Config:
         arbitrary_types_allowed = True

 class DiagnoserChain(BaseModel):
     template_standardize: ChatPromptTemplate
+    llm_standardize: Any  # Fixed LLM for step 1 and 3
     templates_diagnose: List[ChatPromptTemplate]
     llm_diagnose: Any  # User-selectable LLM for step 2
+    template_diagnose_scorecard: ChatPromptTemplate
     async def run(self, user_query: str, exercise_format: str) -> str:
         """
         Runs the composite chain:
+          1. Standardizes the exercise formatting.
+          2. Feeds the standardized exercise into multiple diagnosis prompts in parallel.
+          3. Combines the outputs from each prompt.
+          4. Generates a one-line scorecard of the combined diagnoses.
         """
         # Step 1: Standardize the exercise.
         standardized_exercise = await standardize_exercise(
         )
         # Step 2: Define an async helper to run a single diagnosis prompt.
+        async def run_single_diagnosis(template: ChatPromptTemplate, idx: int) -> str:
             prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
             messages = prompt.to_messages()
             diagnosis_response = await self.llm_diagnose.ainvoke(messages)
             content = diagnosis_response.content if hasattr(diagnosis_response, "content") else diagnosis_response
+            return f"[DIAGNOSIS {idx}]{content}"
         # Launch all diagnosis tasks concurrently.
         tasks = [
+            run_single_diagnosis(template, idx)
             for idx, template in enumerate(self.templates_diagnose, start=1)
         ]
         diagnoses = await asyncio.gather(*tasks)
         # Step 3: Combine the outputs from each prompt.
+        combined_diagnosis = "\n\n---\n".join(diagnoses)
+        # Step 4: Generate scorecard
+        prompt = await self.template_diagnose_scorecard.aformat_prompt(combined_diagnosis=combined_diagnosis)
+        scorecard_messages = prompt.to_messages()
+        scorecard_response = await self.llm_diagnose.ainvoke(scorecard_messages)
+        scorecard = scorecard_response.content if hasattr(scorecard_response, "content") else scorecard_response
+        return scorecard + "\n" + combined_diagnosis
     class Config:
         arbitrary_types_allowed = True

config/chain_configs.py CHANGED Viewed

@@ -1,7 +1,8 @@
 # config/chain_configs.py
 from config.templates import standardize_template, diagnose_template, distractors_template, \
-    diagnose_double_negation_template, diagnose_correct_answer_stands_out_template, \
-    diagnose_distractor_clearly_wrong_template, diagnose_distractor_partially_correct_template
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
@@ -12,13 +13,14 @@ chain_configs = {
         "class": DiagnoserChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini"],     # Always fixed
-        # Provide a list of 4 different diagnosis templates:
         "templates_diagnose": [
-            diagnose_double_negation_template,
-            diagnose_correct_answer_stands_out_template,
-            diagnose_distractor_clearly_wrong_template,
-            diagnose_distractor_partially_correct_template,
         ],
         "llm_diagnose": llms["GPT-4o"],             # Default; can be replaced in UI
     },
     "distractors": {

 # config/chain_configs.py
 from config.templates import standardize_template, diagnose_template, distractors_template, \
+    template_diagnose_double_negation, template_diagnose_correct_answer_stands_out, \
+    template_diagnose_distractor_clearly_wrong, template_diagnose_distractor_partially_correct, \
+    diagnose_scorecard_template
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
         "class": DiagnoserChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini"],     # Always fixed
+        # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
+            template_diagnose_double_negation,
+            template_diagnose_correct_answer_stands_out,
+            template_diagnose_distractor_clearly_wrong,
+            template_diagnose_distractor_partially_correct,
         ],
+        "template_diagnose_scorecard": diagnose_scorecard_template,
         "llm_diagnose": llms["GPT-4o"],             # Default; can be replaced in UI
     },
     "distractors": {

config/templates.py CHANGED Viewed

@@ -19,7 +19,7 @@ diagnose_template = ChatPromptTemplate(
     input_variables=["standardized_exercise"]
 )
-diagnose_double_negation_template = ChatPromptTemplate(
     messages=[
         ("system", """You analyze a multiple-choice exercise for the presence of double negatives.
         Here are some examples of double negatives:
@@ -66,7 +66,7 @@ diagnose_double_negation_template = ChatPromptTemplate(
     input_variables=["standardized_exercise"]
 )
-diagnose_correct_answer_stands_out_template = ChatPromptTemplate(
     messages=[
         ("system", """You evaluate a multiple-choice exercise to determine if the correct answer
         stands out too much compared to the distractors. If the correct answer is significantly
@@ -127,7 +127,7 @@ diagnose_correct_answer_stands_out_template = ChatPromptTemplate(
 # </explanation how the correct answer stands out>
 # </example where X>
-diagnose_distractor_clearly_wrong_template = ChatPromptTemplate(
     messages=[
         ("system", """You assess a multiple-choice exercise to determine if any distractors
         are clearly incorrect and therefore too easy to eliminate. Effective distractors should
@@ -140,7 +140,7 @@ diagnose_distractor_clearly_wrong_template = ChatPromptTemplate(
     input_variables=["standardized_exercise"]
 )
-diagnose_distractor_partially_correct_template = ChatPromptTemplate(
     messages=[
         ("system", """You analyze a multiple-choice exercise to detect distractors that are
         partially correct. Some answer choices may contain elements of truth, leading to
@@ -153,6 +153,35 @@ diagnose_distractor_partially_correct_template = ChatPromptTemplate(
     input_variables=["standardized_exercise"]
 )
 # Template for the distractors brainstorm
 distractors_template = ChatPromptTemplate(
     messages=[

     input_variables=["standardized_exercise"]
 )
+template_diagnose_double_negation = ChatPromptTemplate(
     messages=[
         ("system", """You analyze a multiple-choice exercise for the presence of double negatives.
         Here are some examples of double negatives:
     input_variables=["standardized_exercise"]
 )
+template_diagnose_correct_answer_stands_out = ChatPromptTemplate(
     messages=[
         ("system", """You evaluate a multiple-choice exercise to determine if the correct answer
         stands out too much compared to the distractors. If the correct answer is significantly
 # </explanation how the correct answer stands out>
 # </example where X>
+template_diagnose_distractor_clearly_wrong = ChatPromptTemplate(
     messages=[
         ("system", """You assess a multiple-choice exercise to determine if any distractors
         are clearly incorrect and therefore too easy to eliminate. Effective distractors should
     input_variables=["standardized_exercise"]
 )
+template_diagnose_distractor_partially_correct = ChatPromptTemplate(
     messages=[
         ("system", """You analyze a multiple-choice exercise to detect distractors that are
         partially correct. Some answer choices may contain elements of truth, leading to
     input_variables=["standardized_exercise"]
 )
+diagnose_scorecard_template = ChatPromptTemplate(
+    messages=[
+        ("system", """You analyze the results of the diagnoses of 4 issues, and consolidate that into a very simple one-line visual scorecard that summarizes all diagnoses, immediately giving an overview of the 4 results.
+        Use these two icons:
+        - ✅ means the diagnosis came back negative, the issues is not present.
+        - ❌ means the diagnosis came back positive, the issues is present.
+        (and a third icon if need be: - ❔ means you don't understand the diagnosis result)
+        The scorecard should always look like this:
+        <template>
+        |Double negative: [icon] |Correct answer stands out: [icon] |Distractor clearly false: [icon] |Distractor kinda correct: [icon] |
+        </template>
+        <example 1>
+        |Double negative:✅||Correct answer stands out:✅||Distractor clearly false:✅||Distractor kinda correct:✅|
+        </example1 >
+        <example 2>
+        |Double negative:✅||Correct answer stands out:❌||Distractor clearly false:✅||Distractor kinda correct:❌|
+        </example 2>
+        <example 3>
+        |Double negative:❌||Correct answer stands out:❌||Distractor clearly false:✅||Distractor kinda correct:❔|
+        </example 3>
+        <example 4>
+        |Double negative:✅||Correct answer stands out:✅||Distractor clearly false:❌||Distractor kinda correct:✅|
+        </example 4>
+        """),
+        ("human", "{combined_diagnosis}")
+    ],
+    input_variables=["combined_diagnosis"]
+)
 # Template for the distractors brainstorm
 distractors_template = ChatPromptTemplate(
     messages=[