refactored diagnoser chain

- app.py +29 -14
- chains/diagnoser_chain.py +20 -24
- config/chain_configs.py +11 -6
- config/llm_config.py +2 -1
- config/templates.py +8 -8
- test exercises.txt → test exercises.md +0 -0
app.py
CHANGED

```diff
@@ -4,7 +4,8 @@ import os
 import asyncio
 import logging
 
-from
+from config.exercise_standardizer import standardize_exercise
+from utils.auth import login as auth_login
 from config.chain_configs import chain_configs
 from config.llm_config import llms
 
@@ -16,7 +17,7 @@ def update_exercise_format(selected_model: str):
     if selected_model == "Claude 3.5":
         return gr.update(value="XML")
     else:
-        return gr.update(value="
+        return gr.update(value="Plaintext")
 
 # A generic async runner for chains.
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
@@ -61,26 +62,40 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
 
 # Async wrappers for each chain.
 async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
+    # figure out how many times to run
     num_samples = int("".join(filter(str.isdigit, sampling_count)))
+
     # Fetch the DiagnoserChain configuration.
     config = chain_configs["diagnoser"]
 
-    #
-
-
+    # 1) Standardize the user query exactly once
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
     chain_instance = config["class"](
-        template_standardize=config["template_standardize"],
         templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
         template_diagnose_scorecard=config["template_diagnose_scorecard"],
-
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"])  # Override or fallback to default
+        llm_4o_mini=config["llm_4o_mini"]
     )
-
-
-
-
-
-
+
+    # 3) Run the multiple samples in parallel
+    # Create a short helper that does only the "diagnose" steps:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # run concurrently
+    responses = await asyncio.gather(*tasks)
+
+    # pad up to 5 if needed
+    all_responses = list(responses) + [""] * (5 - len(responses))
+
     # Return a tuple of exactly 5 responses.
     return tuple(all_responses)
 
```
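The net effect in app.py is easier to see outside the diff: standardization now happens once per request, and only the diagnosis step is fanned out `num_samples` times with `asyncio.gather`. A minimal, runnable sketch of the pattern, with hypothetical `fake_standardize`/`fake_diagnose_only` coroutines standing in for the real LLM calls:

```python
import asyncio

# Hypothetical stand-ins for standardize_exercise and DiagnoserChain.diagnose_only;
# each sleep() simulates one LLM round-trip.
async def fake_standardize(user_query: str) -> str:
    await asyncio.sleep(0.01)
    return f"[standardized] {user_query}"

async def fake_diagnose_only(standardized: str) -> str:
    await asyncio.sleep(0.01)
    return f"diagnosis of: {standardized}"

async def run_diagnoser_sketch(user_query: str, num_samples: int) -> tuple:
    # Standardize exactly once, then fan the result out to N parallel samples.
    standardized = await fake_standardize(user_query)
    responses = await asyncio.gather(
        *(fake_diagnose_only(standardized) for _ in range(num_samples))
    )
    # Pad to a fixed arity of 5 so the Gradio output slots always line up.
    return tuple(list(responses) + [""] * (5 - len(responses)))

print(asyncio.run(run_diagnoser_sketch("Which option is correct?", 3)))
```

Judging by the `# Step 1: Standardize` block removed from the chain below, each sample previously re-ran standardization, so at `num_samples = 5` this saves four identical standardize calls per request.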
chains/diagnoser_chain.py
CHANGED

```diff
@@ -7,49 +7,45 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DiagnoserChain(BaseModel):
-    template_standardize: ChatPromptTemplate
-    llm_standardize: Any  # Fixed LLM for step 1
     templates_diagnose: List[ChatPromptTemplate]
-    llm_diagnose: Any  # User-selectable LLM for step 2
     template_diagnose_scorecard: ChatPromptTemplate
+    llm_diagnose: Any
+    llm_4o_mini: Any
 
-    async def
+    async def diagnose_only(self, standardized_exercise: str) -> str:
         """
-
-        1
-        2
-        3
-        4
-
+        Takes a PRE-standardized exercise and:
+        (1) Runs multiple diagnosis prompts in parallel,
+        (2) Merges the results,
+        (3) Generates a scorecard line,
+        (4) Returns the combined text + scorecard.
         """
-        # Step 1: Standardize the exercise.
-        standardized_exercise = await standardize_exercise(
-            user_query, exercise_format, self.template_standardize, self.llm_standardize
-        )
 
-        # Step
+        # Step 1: define an async helper to run each diagnosis in parallel
         async def run_single_diagnosis(template: ChatPromptTemplate, idx: int) -> str:
             prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
             messages = prompt.to_messages()
             diagnosis_response = await self.llm_diagnose.ainvoke(messages)
-            content =
-            return f"--- [DIAGNOSIS {idx}]
+            content = getattr(diagnosis_response, "content", diagnosis_response)
+            return f"--- [DIAGNOSIS {idx}] ---\n{content}"
 
-        #
+        # Step 2: launch all diagnoses concurrently
         tasks = [
             run_single_diagnosis(template, idx)
             for idx, template in enumerate(self.templates_diagnose, start=1)
         ]
         diagnoses = await asyncio.gather(*tasks)
 
-        # Step 3:
-        combined_diagnosis = "\n
+        # Step 3: combine the outputs
+        combined_diagnosis = "\n".join(diagnoses)
 
-        # Step 4: Generate scorecard
-        prompt = await self.template_diagnose_scorecard.aformat_prompt(
+        # Step 4: Generate a one-line scorecard
+        prompt = await self.template_diagnose_scorecard.aformat_prompt(
+            combined_diagnosis=combined_diagnosis
+        )
         scorecard_messages = prompt.to_messages()
-        scorecard_response = await self.
-        scorecard =
+        scorecard_response = await self.llm_4o_mini.ainvoke(scorecard_messages)
+        scorecard = getattr(scorecard_response, "content", scorecard_response)
 
         return combined_diagnosis + "\n\n" + scorecard
 
```
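One detail worth flagging in the new `diagnose_only`: `getattr(response, "content", response)`. Chat-style model wrappers typically return a message object whose text lives in `.content`, while completion-style wrappers may return a bare string; the `getattr` fallback handles both shapes. A tiny sketch with a hypothetical `FakeChatResponse`:

```python
class FakeChatResponse:
    """Hypothetical stand-in for a chat-model response object exposing .content."""
    def __init__(self, content: str):
        self.content = content

def extract_content(response):
    # Message objects expose .content; bare strings pass through unchanged.
    return getattr(response, "content", response)

assert extract_content(FakeChatResponse("diagnosis text")) == "diagnosis text"
assert extract_content("diagnosis text") == "diagnosis text"
print("both response shapes handled")
```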
config/chain_configs.py
CHANGED

```diff
@@ -1,8 +1,14 @@
 # config/chain_configs.py
-from config.templates import
-
-
+from config.templates import (
+    standardize_template,
+    diagnose_template,
+    distractors_template,
+    template_diagnose_double_negation,
+    template_diagnose_correct_answer_stands_out,
+    template_diagnose_distractor_clearly_wrong,
+    template_diagnose_distractor_partially_correct,
     diagnose_scorecard_template
+)
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
@@ -11,8 +17,7 @@ from config.llm_config import llms
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
-        "
-        "llm_standardize": llms["GPT-4o-mini"],  # Always fixed
+        "llm_4o_mini": llms["GPT-4o-mini"],
         # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
             template_diagnose_double_negation,
@@ -26,7 +31,7 @@ chain_configs = {
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
-        "llm_standardize": llms["GPT-4o-mini"],  # Always fixed
+        "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
         "template_distractors": distractors_template,
         "llm_distractors": llms["GPT-4o"],  # Default; can be replaced in UI
     },
```
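Each `chain_configs` entry pairs a `"class"` with the keyword arguments its constructor expects, which is what lets `app.py` build a fresh chain per request straight from the config. A small sketch of that shape, using a hypothetical `DemoChain` dataclass in place of the real pydantic `DiagnoserChain`:

```python
from dataclasses import dataclass

@dataclass
class DemoChain:
    # Hypothetical stand-in for DiagnoserChain's fields.
    templates_diagnose: list
    template_diagnose_scorecard: str
    llm_diagnose: str
    llm_4o_mini: str

demo_configs = {
    "diagnoser": {
        "class": DemoChain,
        "templates_diagnose": ["double_negation", "answer_stands_out"],
        "template_diagnose_scorecard": "scorecard-template",
        "llm_diagnose": "gpt-4o",      # default; the UI can override it
        "llm_4o_mini": "gpt-4o-mini",  # fixed model for the scorecard step
    }
}

config = demo_configs["diagnoser"]
# Everything except "class" is a constructor kwarg, so instantiation can be generic:
chain = config["class"](**{k: v for k, v in config.items() if k != "class"})
print(chain)
```

Note that `app.py` above spells the kwargs out explicitly rather than splatting them; the generic form is just one way the `"class"` key could be used.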
config/llm_config.py
CHANGED

```diff
@@ -30,7 +30,8 @@ def create_deepseek_llm(model_name: str, temperature: float):
 
 llms = {
     "GPT-4o": create_openai_llm("gpt-4o", LOW),
-    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
     "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", HIGH),
```
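The mini-model split matters downstream: `"GPT-4o-mini"` now runs at LOW temperature, while the deterministic ZERO variant moves to the new `"GPT-4o-mini-zero"` key (which the distractors config above switches to). A sketch of the preset pattern; the numeric ZERO/LOW/HIGH values here are assumptions, since the real constants aren't visible in this hunk:

```python
# Assumed values: the actual ZERO/LOW/HIGH constants live elsewhere
# in config/llm_config.py and are not shown in the diff.
ZERO, LOW, HIGH = 0.0, 0.2, 1.0

def create_demo_llm(model_name: str, temperature: float) -> dict:
    # Stand-in for create_openai_llm; returns a plain dict instead of a client.
    return {"model": model_name, "temperature": temperature}

llms = {
    "GPT-4o-mini-zero": create_demo_llm("gpt-4o-mini", ZERO),  # deterministic standardizer
    "GPT-4o-mini": create_demo_llm("gpt-4o-mini", LOW),        # general-purpose default
}
print(llms["GPT-4o-mini-zero"], llms["GPT-4o-mini"])
```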
config/templates.py
CHANGED

```diff
@@ -60,7 +60,7 @@ template_diagnose_double_negation = ChatPromptTemplate(
         </double negative explanation>
         </example 2>.
         If it's obvious that there is or isn't a double negative in this exercise, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -114,7 +114,7 @@ template_diagnose_correct_answer_stands_out = ChatPromptTemplate(
         </example where the correct answer is grammatically different>
 
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If the correct answer in the given exercise clearly does or does not stand out, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -134,7 +134,7 @@ template_diagnose_distractor_clearly_wrong = ChatPromptTemplate(
         be plausible but incorrect.
         Identify distractors that are obviously wrong, such that even students that are completely uninformed about the topic can eliminate them.
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't obviously incorrect, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -146,7 +146,7 @@ template_diagnose_distractor_partially_correct = ChatPromptTemplate(
         partially correct. Some answer choices may contain elements of truth, leading to
         ambiguity. Identify such cases. Really stress-test them: is there a story you could tell where the distractor, in the context of this exercise, could be considered a (partially) correct answer?
         Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't unambiguously false, just give a short one-sentence diagnosis on this.
-        If
+        If the issue is more nuanced, do some reasoning first, and give your diagnosis then.
         """),
         ("human", "{standardized_exercise}")
     ],
@@ -166,16 +166,16 @@ diagnose_scorecard_template = ChatPromptTemplate(
         (and a third icon if need be: - ❓ means the diagnosis is unclear)
         The scorecard should always look like this:
         <template>
-
+        The exercise does not contain/contains a double negative: ✅/❌ -- The correct answer does not/does stand out: ✅/❌ -- None/Some of the distractors are too obviously false: ✅/❌ -- None/Some of the distractors are actually also kinda correct: ✅/❌
         </template>
         <example 1>
-
+        The exercise doesn't contain a double negative: ✅ -- The correct answer does not stand out: ✅ -- None of the distractors are too obviously false: ✅ -- None of the distractors are actually also kinda correct: ✅
         </example 1>
         <example 2>
-
+        The exercise doesn't contain a double negative: ✅ -- The correct answer does stand out: ❌ -- None of the distractors are too obviously false: ✅ -- Some of the distractors are actually also kinda correct: ❌
         </example 2>
         <example 3>
-
+        The exercise contains a double negative: ❌ -- The correct answer does not stand out: ✅ -- Some of the distractors are too obviously false: ❌ -- None of the distractors are actually also kinda correct: ✅
         </example 3>
         """),
         ("human", "{combined_diagnosis}")
```
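The scorecard the template asks for is one `--`-separated line of claim/icon pairs. A hypothetical helper that renders the same shape, useful for eyeballing what a well-formed scorecard line should look like:

```python
def render_scorecard(findings: dict) -> str:
    # True means "no issue found" (✅) and False means "issue found" (❌),
    # mirroring the icon convention in diagnose_scorecard_template.
    parts = [f"{claim}: {'✅' if ok else '❌'}" for claim, ok in findings.items()]
    return " -- ".join(parts)

print(render_scorecard({
    "The exercise doesn't contain a double negative": True,
    "The correct answer does stand out": False,
    "None of the distractors are too obviously false": True,
    "Some of the distractors are actually also kinda correct": False,
}))
```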
test exercises.txt → test exercises.md
RENAMED

File without changes
|