scorecard step added to diagnosis
Browse files
- chains/diagnoser_chain.py +19 -9
- config/chain_configs.py +9 -7
- config/templates.py +33 -4
chains/diagnoser_chain.py
CHANGED
|
@@ -8,16 +8,19 @@ from config.exercise_standardizer import standardize_exercise
|
|
| 8 |
|
| 9 |
class DiagnoserChain(BaseModel):
|
| 10 |
template_standardize: ChatPromptTemplate
|
| 11 |
-
llm_standardize: Any # Fixed LLM for step 1
|
| 12 |
templates_diagnose: List[ChatPromptTemplate]
|
| 13 |
llm_diagnose: Any # User-selectable LLM for step 2
|
|
|
|
| 14 |
|
| 15 |
async def run(self, user_query: str, exercise_format: str) -> str:
|
| 16 |
"""
|
| 17 |
Runs the composite chain:
|
| 18 |
-
1. Standardizes the exercise formatting
|
| 19 |
-
2. Feeds the standardized exercise into multiple diagnosis prompts in parallel
|
| 20 |
-
3. Combines the outputs from
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
# Step 1: Standardize the exercise.
|
| 23 |
standardized_exercise = await standardize_exercise(
|
|
@@ -25,23 +28,30 @@ class DiagnoserChain(BaseModel):
|
|
| 25 |
)
|
| 26 |
|
| 27 |
# Step 2: Define an async helper to run a single diagnosis prompt.
|
| 28 |
-
async def
|
| 29 |
prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
|
| 30 |
messages = prompt.to_messages()
|
| 31 |
diagnosis_response = await self.llm_diagnose.ainvoke(messages)
|
| 32 |
content = diagnosis_response.content if hasattr(diagnosis_response, "content") else diagnosis_response
|
| 33 |
-
return f"
|
| 34 |
|
| 35 |
# Launch all diagnosis tasks concurrently.
|
| 36 |
tasks = [
|
| 37 |
-
|
| 38 |
for idx, template in enumerate(self.templates_diagnose, start=1)
|
| 39 |
]
|
| 40 |
diagnoses = await asyncio.gather(*tasks)
|
| 41 |
|
| 42 |
# Step 3: Combine the outputs from each prompt.
|
| 43 |
-
combined_diagnosis = "\n\n---\n
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
class Config:
|
| 47 |
arbitrary_types_allowed = True
|
|
|
|
| 8 |
|
| 9 |
class DiagnoserChain(BaseModel):
|
| 10 |
template_standardize: ChatPromptTemplate
|
| 11 |
+
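llm_standardize: Any # Fixed LLM for step 1 (NOTE(review): the step-4 scorecard call in run() uses llm_diagnose, not this LLM — confirm which is intended)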
|
| 12 |
templates_diagnose: List[ChatPromptTemplate]
|
| 13 |
llm_diagnose: Any # User-selectable LLM for step 2
|
| 14 |
+
template_diagnose_scorecard: ChatPromptTemplate
|
| 15 |
|
| 16 |
async def run(self, user_query: str, exercise_format: str) -> str:
|
| 17 |
"""
|
| 18 |
Runs the composite chain:
|
| 19 |
+
1. Standardizes the exercise formatting
|
| 20 |
+
2. Feeds the standardized exercise into multiple diagnosis prompts in parallel
|
| 21 |
+
3. Combines the outputs from each prompt.
|
| 22 |
+
4. Generates one-line scorecard of combined diagnoses
|
| 23 |
+
|
| 24 |
"""
|
| 25 |
# Step 1: Standardize the exercise.
|
| 26 |
standardized_exercise = await standardize_exercise(
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
# Step 2: Define an async helper to run a single diagnosis prompt.
|
| 31 |
+
async def run_single_diagnosis(template: ChatPromptTemplate, idx: int) -> str:
|
| 32 |
prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
|
| 33 |
messages = prompt.to_messages()
|
| 34 |
diagnosis_response = await self.llm_diagnose.ainvoke(messages)
|
| 35 |
content = diagnosis_response.content if hasattr(diagnosis_response, "content") else diagnosis_response
|
| 36 |
+
return f"[DIAGNOSIS {idx}]{content}"
|
| 37 |
|
| 38 |
# Launch all diagnosis tasks concurrently.
|
| 39 |
tasks = [
|
| 40 |
+
run_single_diagnosis(template, idx)
|
| 41 |
for idx, template in enumerate(self.templates_diagnose, start=1)
|
| 42 |
]
|
| 43 |
diagnoses = await asyncio.gather(*tasks)
|
| 44 |
|
| 45 |
# Step 3: Combine the outputs from each prompt.
|
| 46 |
+
combined_diagnosis = "\n\n---\n".join(diagnoses)
|
| 47 |
+
|
| 48 |
+
# Step 4: Generate scorecard
|
| 49 |
+
prompt = await self.template_diagnose_scorecard.aformat_prompt(combined_diagnosis=combined_diagnosis)
|
| 50 |
+
scorecard_messages = prompt.to_messages()
|
| 51 |
+
scorecard_response = await self.llm_diagnose.ainvoke(scorecard_messages)
|
| 52 |
+
scorecard = scorecard_response.content if hasattr(scorecard_response, "content") else scorecard_response
|
| 53 |
+
|
| 54 |
+
return scorecard + "\n" + combined_diagnosis
|
| 55 |
|
| 56 |
class Config:
|
| 57 |
arbitrary_types_allowed = True
|
config/chain_configs.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
# config/chain_configs.py
|
| 2 |
from config.templates import standardize_template, diagnose_template, distractors_template, \
|
| 3 |
-
|
| 4 |
-
|
|
|
|
| 5 |
from chains.diagnoser_chain import DiagnoserChain
|
| 6 |
from chains.distractors_chain import DistractorsChain
|
| 7 |
from config.llm_config import llms
|
|
@@ -12,13 +13,14 @@ chain_configs = {
|
|
| 12 |
"class": DiagnoserChain,
|
| 13 |
"template_standardize": standardize_template,
|
| 14 |
"llm_standardize": llms["GPT-4o-mini"], # Always fixed
|
| 15 |
-
#
|
| 16 |
"templates_diagnose": [
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
],
|
|
|
|
| 22 |
"llm_diagnose": llms["GPT-4o"], # Default; can be replaced in UI
|
| 23 |
},
|
| 24 |
"distractors": {
|
|
|
|
| 1 |
# config/chain_configs.py
|
| 2 |
from config.templates import standardize_template, diagnose_template, distractors_template, \
|
| 3 |
+
template_diagnose_double_negation, template_diagnose_correct_answer_stands_out, \
|
| 4 |
+
template_diagnose_distractor_clearly_wrong, template_diagnose_distractor_partially_correct, \
|
| 5 |
+
diagnose_scorecard_template
|
| 6 |
from chains.diagnoser_chain import DiagnoserChain
|
| 7 |
from chains.distractors_chain import DistractorsChain
|
| 8 |
from config.llm_config import llms
|
|
|
|
| 13 |
"class": DiagnoserChain,
|
| 14 |
"template_standardize": standardize_template,
|
| 15 |
"llm_standardize": llms["GPT-4o-mini"], # Always fixed
|
| 16 |
+
# 4 different diagnosis templates (to run in parallel):
|
| 17 |
"templates_diagnose": [
|
| 18 |
+
template_diagnose_double_negation,
|
| 19 |
+
template_diagnose_correct_answer_stands_out,
|
| 20 |
+
template_diagnose_distractor_clearly_wrong,
|
| 21 |
+
template_diagnose_distractor_partially_correct,
|
| 22 |
],
|
| 23 |
+
"template_diagnose_scorecard": diagnose_scorecard_template,
|
| 24 |
"llm_diagnose": llms["GPT-4o"], # Default; can be replaced in UI
|
| 25 |
},
|
| 26 |
"distractors": {
|
config/templates.py
CHANGED
|
@@ -19,7 +19,7 @@ diagnose_template = ChatPromptTemplate(
|
|
| 19 |
input_variables=["standardized_exercise"]
|
| 20 |
)
|
| 21 |
|
| 22 |
-
|
| 23 |
messages=[
|
| 24 |
("system", """You analyze a multiple-choice exercise for the presence of double negatives.
|
| 25 |
Here are some examples of double negatives:
|
|
@@ -66,7 +66,7 @@ diagnose_double_negation_template = ChatPromptTemplate(
|
|
| 66 |
input_variables=["standardized_exercise"]
|
| 67 |
)
|
| 68 |
|
| 69 |
-
|
| 70 |
messages=[
|
| 71 |
("system", """You evaluate a multiple-choice exercise to determine if the correct answer
|
| 72 |
stands out too much compared to the distractors. If the correct answer is significantly
|
|
@@ -127,7 +127,7 @@ diagnose_correct_answer_stands_out_template = ChatPromptTemplate(
|
|
| 127 |
# </explanation how the correct answer stands out>
|
| 128 |
# </example where X>
|
| 129 |
|
| 130 |
-
|
| 131 |
messages=[
|
| 132 |
("system", """You assess a multiple-choice exercise to determine if any distractors
|
| 133 |
are clearly incorrect and therefore too easy to eliminate. Effective distractors should
|
|
@@ -140,7 +140,7 @@ diagnose_distractor_clearly_wrong_template = ChatPromptTemplate(
|
|
| 140 |
input_variables=["standardized_exercise"]
|
| 141 |
)
|
| 142 |
|
| 143 |
-
|
| 144 |
messages=[
|
| 145 |
("system", """You analyze a multiple-choice exercise to detect distractors that are
|
| 146 |
partially correct. Some answer choices may contain elements of truth, leading to
|
|
@@ -153,6 +153,35 @@ diagnose_distractor_partially_correct_template = ChatPromptTemplate(
|
|
| 153 |
input_variables=["standardized_exercise"]
|
| 154 |
)
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
# Template for the distractors brainstorm
|
| 157 |
distractors_template = ChatPromptTemplate(
|
| 158 |
messages=[
|
|
|
|
| 19 |
input_variables=["standardized_exercise"]
|
| 20 |
)
|
| 21 |
|
| 22 |
+
template_diagnose_double_negation = ChatPromptTemplate(
|
| 23 |
messages=[
|
| 24 |
("system", """You analyze a multiple-choice exercise for the presence of double negatives.
|
| 25 |
Here are some examples of double negatives:
|
|
|
|
| 66 |
input_variables=["standardized_exercise"]
|
| 67 |
)
|
| 68 |
|
| 69 |
+
template_diagnose_correct_answer_stands_out = ChatPromptTemplate(
|
| 70 |
messages=[
|
| 71 |
("system", """You evaluate a multiple-choice exercise to determine if the correct answer
|
| 72 |
stands out too much compared to the distractors. If the correct answer is significantly
|
|
|
|
| 127 |
# </explanation how the correct answer stands out>
|
| 128 |
# </example where X>
|
| 129 |
|
| 130 |
+
template_diagnose_distractor_clearly_wrong = ChatPromptTemplate(
|
| 131 |
messages=[
|
| 132 |
("system", """You assess a multiple-choice exercise to determine if any distractors
|
| 133 |
are clearly incorrect and therefore too easy to eliminate. Effective distractors should
|
|
|
|
| 140 |
input_variables=["standardized_exercise"]
|
| 141 |
)
|
| 142 |
|
| 143 |
+
template_diagnose_distractor_partially_correct = ChatPromptTemplate(
|
| 144 |
messages=[
|
| 145 |
("system", """You analyze a multiple-choice exercise to detect distractors that are
|
| 146 |
partially correct. Some answer choices may contain elements of truth, leading to
|
|
|
|
| 153 |
input_variables=["standardized_exercise"]
|
| 154 |
)
|
| 155 |
|
| 156 |
+
diagnose_scorecard_template = ChatPromptTemplate(
|
| 157 |
+
messages=[
|
| 158 |
+
("system", """You analyze the results of the diagnoses of 4 issues, and consolidate that into a very simple one-line visual scorecard that summarizes all diagnoses, immediately giving an overview of the 4 results.
|
| 159 |
+
Use these two icons:
|
| 160 |
+
- ✅ means the diagnosis came back negative, the issues is not present.
|
| 161 |
+
- ❌ means the diagnosis came back positive, the issues is present.
|
| 162 |
+
(and a third icon if need be: - ❓ means you don't understand the diagnosis result)
|
| 163 |
+
The scorecard should always look like this:
|
| 164 |
+
<template>
|
| 165 |
+
|Double negative: [icon] |Correct answer stands out: [icon] |Distractor clearly false: [icon] |Distractor kinda correct: [icon] |
|
| 166 |
+
</template>
|
| 167 |
+
<example 1>
|
| 168 |
+
|Double negative:✅||Correct answer stands out:✅||Distractor clearly false:✅||Distractor kinda correct:✅|
|
| 169 |
+
</example1 >
|
| 170 |
+
<example 2>
|
| 171 |
+
|Double negative:✅||Correct answer stands out:❌||Distractor clearly false:✅||Distractor kinda correct:❌|
|
| 172 |
+
</example 2>
|
| 173 |
+
<example 3>
|
| 174 |
+
|Double negative:❌||Correct answer stands out:❌||Distractor clearly false:✅||Distractor kinda correct:❌|
|
| 175 |
+
</example 3>
|
| 176 |
+
<example 4>
|
| 177 |
+
|Double negative:✅||Correct answer stands out:✅||Distractor clearly false:❌||Distractor kinda correct:✅|
|
| 178 |
+
</example 4>
|
| 179 |
+
"""),
|
| 180 |
+
("human", "{combined_diagnosis}")
|
| 181 |
+
],
|
| 182 |
+
input_variables=["combined_diagnosis"]
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
# Template for the distractors brainstorm
|
| 186 |
distractors_template = ChatPromptTemplate(
|
| 187 |
messages=[
|