Big changes (moved dropdowns into tabs, implemented distractors chain)
- app.py +181 -80
- chains/distractors_chain.py +75 -17
- config/chain_configs.py +13 -7
- config/llm_config.py +24 -8
- config/templates.py +1 -0
app.py
CHANGED

@@ -14,12 +14,98 @@ logger = logging.getLogger(__name__)
 # --- Callback to update the exercise format dropdown based on LLM selection ---
 def update_exercise_format(selected_model: str):
     # When a Claude model is selected, default the format to XML; otherwise, default to Plaintext.
-    if
+    if "Claude" in selected_model:
         return gr.update(value="XML")
     else:
         return gr.update(value="Plaintext")
 
-
+
+
+# Async wrappers for each chain.
+async def run_diagnoser(user_query: str, model_choice_validate: str, exercise_format_validate: str, sampling_count_validate: str) -> tuple:
+    # Figure out how many times to run.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_validate)))
+
+    # Fetch the DiagnoserChain configuration.
+    config = chain_configs["diagnoser"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_validate,
+        config["template_standardize"],  # Only if you kept them in config
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing.
+    chain_instance = config["class"](
+        templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(model_choice_validate, config["llm_diagnose"]),
+        template_diagnose_scorecard=config["template_diagnose_scorecard"],
+        llm_4o_mini=config["llm_4o_mini"],
+        llm_4o=config["llm_4o"]
+    )
+
+    # 3) Run the multiple samples in parallel.
+    # Each task runs only the "diagnose" steps on the already-standardized exercise:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # Run concurrently.
+    responses = await asyncio.gather(*tasks)
+
+    # Pad up to 10 if needed.
+    all_responses = list(responses) + [""] * (10 - len(responses))
+
+    # Return a tuple of exactly 10 responses.
+    return tuple(all_responses)
+
+
+async def run_distractors(
+    user_query: str,
+    model_choice_distractors_1: str,
+    model_choice_distractors_2: str,
+    exercise_format_distractors: str,
+    sampling_count_distractors: str
+) -> tuple:
+    # 0) Parse how many concurrent runs (samples) we want.
+    num_samples = int("".join(filter(str.isdigit, sampling_count_distractors)))
+    # Fetch the DistractorsChain configuration.
+    config = chain_configs["distractors"]
+
+    # 1) Standardize the user query exactly once.
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format_distractors,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Build the DistractorsChain instance.
+    chain_instance = config["class"](
+        template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
+        template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
+        llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]),  # User-selected (low and mid temp GPT-4o by default)
+        llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),
+        template_consolidate=config["template_consolidate"],
+        llm_consolidate=config["llm_consolidate"],
+    )
+
+    # 3) Create N tasks in parallel (one full distractor generation pipeline per sample).
+    tasks = [
+        chain_instance.run(standardized_exercise) for _ in range(num_samples)
+    ]
+    results = await asyncio.gather(*tasks)
+
+    # 4) Pad up to 10 outputs to correspond to the 10 response fields.
+    all_responses = list(results) + [""] * (10 - len(results))
+
+    return tuple(all_responses)
+
+
+# A generic async runner for simple chains (currently not used).
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
     try:
         chain_config = chain_configs.get(chain_name)

@@ -60,48 +146,6 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str)
         logger.error(f"Error in run_chain for '{chain_name}': {e}")
         return f"Error: {e}"
 
-# Async wrappers for each chain.
-async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
-    # figure out how many times to run
-    num_samples = int("".join(filter(str.isdigit, sampling_count)))
-
-    # Fetch the DiagnoserChain configuration.
-    config = chain_configs["diagnoser"]
-
-    # 1) Standardize the user query exactly once
-    standardized_exercise = await standardize_exercise(
-        user_query,
-        exercise_format,
-        config["template_standardize"],  # Only if you kept them in config
-        config["llm_standardize"]
-    )
-
-    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
-    chain_instance = config["class"](
-        templates_diagnose=config["templates_diagnose"],
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
-        template_diagnose_scorecard=config["template_diagnose_scorecard"],
-        llm_4o_mini=config["llm_4o_mini"],
-        llm_4o=config["llm_4o"]
-    )
-
-    # 3) Run the multiple samples in parallel
-    # Create a short helper that does only the "diagnose" steps:
-    tasks = [
-        chain_instance.diagnose_only(standardized_exercise)
-        for _ in range(num_samples)
-    ]
-    # run concurrently
-    responses = await asyncio.gather(*tasks)
-
-    # pad up to 5 if needed
-    all_responses = list(responses) + [""] * (10 - len(responses))
-
-    # Return a tuple of exactly 5 responses.
-    return tuple(all_responses)
-
-async def run_distractors(user_query: str, model_choice: str) -> str:
-    return await run_chain("distractors", {"user_query": user_query}, model_choice)
 
 # -------------------------------
 # Build the Gradio Interface

@@ -116,46 +160,48 @@ with gr.Blocks() as interface:
 
     # --- Main App (initially hidden) ---
     with gr.Column(visible=False, elem_id="main_app") as app_container:
-        gr.Markdown("## Pick the tab for your task of choice
-
-        # Create a row for the control dropdowns
-        with gr.Row():
-            model_choice = gr.Dropdown(
-                choices=list(llms.keys()),
-                value="GPT-4o",
-                label="Select LLM",
-                interactive=True,
-            )
-            exercise_format = gr.Dropdown(
-                choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
-                value="Markdown",
-                label="Exercise Format",
-                interactive=True,
-            )
-            sampling_count = gr.Dropdown(
-                choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
-                value="1",
-                label="Sampling Count",
-                interactive=True,
-            )
-            # Set up a change callback so that if the user selects "Claude 3.5", the exercise format updates to "XML"
-            model_choice.change(
-                fn=update_exercise_format,
-                inputs=[model_choice],
-                outputs=[exercise_format]
-            )
+        gr.Markdown("## Pick the tab for your task of choice")
+
         with gr.Tabs():
-            with gr.TabItem("🩺
+            with gr.TabItem("🩺 Diagnose exercise"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(
                     """
                     <div style="margin-bottom: 10px;">
-                        <span style="font-size: 1.5em; cursor: help;" title="
-                            ℹ️ <i>← mouseover
+                        <span style="font-size: 1.5em; cursor: help;" title="Diagnose exercise: Diagnoses potential issues for the given exercise(s).">
+                            ℹ️ <i>← mouseover</i>
                         </span>
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
+                with gr.Row():
+                    model_choice_validate = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select LLM",
+                        interactive=True,
+                    )
+                    exercise_format_validate = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Markdown",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_validate = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML".
+                model_choice_validate.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_validate],
+                    outputs=[exercise_format_validate]
+                )
+
                 diagnoser_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Exercise body: <mc:exercise xmlns:mc= ...")
                 diagnoser_button = gr.Button("Submit")
                 diagnoser_response_1 = gr.Textbox(label="Response 1", interactive=False)

@@ -168,6 +214,8 @@ with gr.Blocks() as interface:
                 diagnoser_response_8 = gr.Textbox(label="Response 8", interactive=False)
                 diagnoser_response_9 = gr.Textbox(label="Response 9", interactive=False)
                 diagnoser_response_10 = gr.Textbox(label="Response 10", interactive=False)
+
+
             with gr.TabItem("🤖 Generate distractors"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(

@@ -179,10 +227,52 @@ with gr.Blocks() as interface:
                     </div>
                     """
                 )
+
+                # Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
+                with gr.Row():
+                    model_choice_distractors_1 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (low temp)",
+                        label="Select first LLM",
+                        interactive=True,
+                    )
+                    model_choice_distractors_2 = gr.Dropdown(
+                        choices=list(llms.keys()),
+                        value="GPT-4o (mid temp)",
+                        label="Select second LLM",
+                        interactive=True,
+                    )
+                    exercise_format_distractors = gr.Dropdown(
+                        choices=["Markdown", "XML", "Plaintext", "Raw (original)"],
+                        value="Plaintext",
+                        label="Exercise Format",
+                        interactive=True,
+                    )
+                    sampling_count_distractors = gr.Dropdown(
+                        choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+                        value="1",
+                        label="Sampling Count",
+                        interactive=True,
+                    )
+                # Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML".
+                model_choice_distractors_1.change(
+                    fn=update_exercise_format,
+                    inputs=[model_choice_distractors_1],
+                    outputs=[exercise_format_distractors]
+                )
+
                 distractors_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Stelling: Dit is een ..... voorbeeld van een stelling. A. Mooi B. Lelijk ...")
                 distractors_button = gr.Button("Submit")
-                gr.
+                distractors_response_1 = gr.Textbox(label="Response 1", interactive=False)
+                distractors_response_2 = gr.Textbox(label="Response 2", interactive=False)
+                distractors_response_3 = gr.Textbox(label="Response 3", interactive=False)
+                distractors_response_4 = gr.Textbox(label="Response 4", interactive=False)
+                distractors_response_5 = gr.Textbox(label="Response 5", interactive=False)
+                distractors_response_6 = gr.Textbox(label="Response 6", interactive=False)
+                distractors_response_7 = gr.Textbox(label="Response 7", interactive=False)
+                distractors_response_8 = gr.Textbox(label="Response 8", interactive=False)
+                distractors_response_9 = gr.Textbox(label="Response 9", interactive=False)
+                distractors_response_10 = gr.Textbox(label="Response 10", interactive=False)
             with gr.TabItem("🧠 Generate learning objectives"):
                 # Insert an HTML info icon with a tooltip at the top of the tab content.
                 gr.HTML(

@@ -211,7 +301,7 @@ with gr.Blocks() as interface:
 
     diagnoser_button.click(
         fn=run_diagnoser,
-        inputs=[diagnoser_input,
+        inputs=[diagnoser_input, model_choice_validate, exercise_format_validate, sampling_count_validate],
        outputs=[
            diagnoser_response_1,
            diagnoser_response_2,

@@ -228,8 +318,19 @@ with gr.Blocks() as interface:
 
     distractors_button.click(
        fn=run_distractors,
-        inputs=[distractors_input,
-        outputs=[
+        inputs=[distractors_input, model_choice_distractors_1, model_choice_distractors_2, exercise_format_distractors, sampling_count_distractors],
+        outputs=[
+            distractors_response_1,
+            distractors_response_2,
+            distractors_response_3,
+            distractors_response_4,
+            distractors_response_5,
+            distractors_response_6,
+            distractors_response_7,
+            distractors_response_8,
+            distractors_response_9,
+            distractors_response_10
+        ]
    )
 
 # Launch the app.
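Both wrappers follow the same fan-out pattern: launch num_samples identical chain runs concurrently with asyncio.gather, then pad the result list to ten entries, since a Gradio click handler must return one value per declared output component. A minimal self-contained sketch of that pattern; fake_diagnose is a hypothetical stand-in for the real chain call:

    import asyncio

    N_OUTPUTS = 10  # one slot per fixed gr.Textbox output in the UI

    async def fake_diagnose(exercise: str) -> str:
        # Hypothetical stand-in for chain_instance.diagnose_only(...).
        await asyncio.sleep(0.1)
        return f"diagnosis of: {exercise}"

    async def run_samples(exercise: str, sampling_count: str) -> tuple:
        # "3" -> 3, mirroring int("".join(filter(str.isdigit, sampling_count)))
        num_samples = int("".join(filter(str.isdigit, sampling_count)))
        # Fan out num_samples independent runs and await them concurrently.
        responses = await asyncio.gather(
            *(fake_diagnose(exercise) for _ in range(num_samples))
        )
        # Pad with empty strings so the tuple always has exactly N_OUTPUTS items.
        return tuple(list(responses) + [""] * (N_OUTPUTS - len(responses)))

    print(asyncio.run(run_samples("Example exercise", "3")))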
chains/distractors_chain.py
CHANGED
@@ -1,4 +1,5 @@
 # chains/distractors_chain.py
+import asyncio
 from pydantic import BaseModel
 from typing import Any
 from langchain_core.prompts.chat import ChatPromptTemplate

@@ -6,29 +7,86 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DistractorsChain(BaseModel):
+    template_distractors_brainstorm_1: ChatPromptTemplate
+    template_distractors_brainstorm_2: ChatPromptTemplate
+    llm_brainstorm_1: Any  # User-selectable LLMs for brainstorm (low and high temp)
+    llm_brainstorm_2: Any
+    template_consolidate: ChatPromptTemplate
+    llm_consolidate: Any
 
-    async def run(self, user_query: str, exercise_format: str) -> str:
+    async def run(self, standardized_exercise: str) -> str:
         """
+        Overall flow (step 1, standardizing the exercise, happens in the caller):
+        2) Run 4 parallel brainstorming calls:
+           - 2 use 'template_distractors_brainstorm_1' with (low-temp, high-temp)
+           - 2 use 'template_distractors_brainstorm_2' with (low-temp, high-temp)
+        3) Merge those four partial results into a single final answer
+           via a "consolidation" prompt.
+        4) Return the final string.
         """
-        distractors_messages = prompt_distractors.to_messages()
-        distractors = await self.llm_distr.ainvoke(distractors_messages)
+
+        # --- Step 2: Brainstorm in parallel ---
+        async def run_brainstorm(
+            prompt_template: ChatPromptTemplate,
+            llm_brainstorm: Any,
+            index_label: str
+        ) -> str:
+            # Format the prompt with the standardized exercise.
+            prompt = await prompt_template.aformat_prompt(
+                standardized_exercise=standardized_exercise
+            )
+            messages = prompt.to_messages()
+
+            # Call the specified LLM.
+            response = await llm_brainstorm.ainvoke(messages)
+            content = getattr(response, "content", response)
+
+            return f"[Brainstorm {index_label}]\n{content}"
+
+        tasks = []
+        # Template 1, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_1,
+            "T1-Low"
+        ))
+        # Template 1, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_1,
+            self.llm_brainstorm_2,
+            "T1-High"
+        ))
+        # Template 2, low-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_1,
+            "T2-Low"
+        ))
+        # Template 2, high-temp
+        tasks.append(run_brainstorm(
+            self.template_distractors_brainstorm_2,
+            self.llm_brainstorm_2,
+            "T2-High"
+        ))
+
+        # Kick them off concurrently.
+        brainstorm_results = await asyncio.gather(*tasks)
+
+        # Combine them into a single multiline string.
+        combined_brainstorms = "\n\n".join(brainstorm_results)
+
+        # --- Step 3: Consolidate the 4 partial outputs into a final response ---
+        consolidation_prompt = await self.template_consolidate.aformat_prompt(
+            brainstorm_outputs=combined_brainstorms,
+            standardized_exercise=standardized_exercise
+        )
+        consolidation_messages = consolidation_prompt.to_messages()
+
+        consolidation_response = await self.llm_consolidate.ainvoke(consolidation_messages)
+        final_output = getattr(consolidation_response, "content", consolidation_response)
 
+        # Return the final merged distractors response.
+        return final_output
 
     class Config:
         arbitrary_types_allowed = True
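For reference, a minimal usage sketch of the new chain, wired the same way chain_configs and run_distractors do it (assumes the repo's templates and llms registry are importable and API keys are configured):

    import asyncio
    from chains.distractors_chain import DistractorsChain
    from config.templates import (
        template_distractors_brainstorm_1,
        template_distractors_brainstorm_2,
        distractors_consolidate_template,
    )
    from config.llm_config import llms

    chain = DistractorsChain(
        template_distractors_brainstorm_1=template_distractors_brainstorm_1,
        template_distractors_brainstorm_2=template_distractors_brainstorm_2,
        llm_brainstorm_1=llms["GPT-4o (low temp)"],
        llm_brainstorm_2=llms["GPT-4o (mid temp)"],
        template_consolidate=distractors_consolidate_template,
        llm_consolidate=llms["GPT-4o (low temp)"],
    )

    # Step 1 (standardizing the exercise) is the caller's responsibility.
    standardized = "Stelling: ... A. Mooi B. Lelijk ..."
    print(asyncio.run(chain.run(standardized)))

Crossing the two brainstorm templates with the two temperature variants yields four complementary drafts for the consolidation prompt to merge.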
config/chain_configs.py
CHANGED
@@ -7,20 +7,23 @@ from config.templates import (
     template_diagnose_correct_answer_stands_out,
     template_diagnose_distractor_clearly_wrong,
     template_diagnose_distractor_partially_correct,
-    diagnose_scorecard_template
+    diagnose_scorecard_template,
+    template_distractors_brainstorm_1,
+    template_distractors_brainstorm_2,
+    distractors_consolidate_template
 )
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
 
-# Note: The default LLM here is 4o; the UI can override this choice.
+# Note: The default LLM here is GPT-4o (low temp); the UI can override this choice.
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
         "llm_4o_mini": llms["GPT-4o-mini"],
-        "llm_4o": llms["GPT-4o"],
+        "llm_4o": llms["GPT-4o (low temp)"],
         # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
             template_diagnose_double_negation,

@@ -29,14 +32,17 @@ chain_configs = {
             template_diagnose_distractor_partially_correct,
         ],
         "template_diagnose_scorecard": diagnose_scorecard_template,
-        "llm_diagnose": llms["GPT-4o"],  # Default; can be replaced in UI
+        "llm_diagnose": llms["GPT-4o (low temp)"],  # Default; can be replaced in UI
     },
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
         "llm_standardize": llms["GPT-4o-mini-zero"],  # Always fixed
+        "template_distractors_brainstorm_1": template_distractors_brainstorm_1,
+        "template_distractors_brainstorm_2": template_distractors_brainstorm_2,
+        "llm_brainstorm_1": llms["GPT-4o (low temp)"],
+        "llm_brainstorm_2": llms["GPT-4o (mid temp)"],
+        "template_consolidate": distractors_consolidate_template,
+        "llm_consolidate": llms["GPT-4o (low temp)"],  # or something else
     },
 }
config/llm_config.py
CHANGED
@@ -19,8 +19,12 @@ HIGH = 1.2
 def create_openai_llm(model_name: str, temperature: float):
     return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, temperature=temperature)
 
-def create_openai_reasoning_llm(model_name: str):
+def create_openai_reasoning_llm(model_name: str, reasoning_effort: str = None):
+    # If reasoning_effort is provided, pass it; otherwise, avoid sending the parameter.
+    if reasoning_effort:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name, reasoning_effort=reasoning_effort)
+    else:
+        return ChatOpenAI(api_key=OPENAI_API_KEY, model_name=model_name)
 
 def create_anthropic_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)

@@ -29,14 +33,26 @@ def create_deepseek_llm(model_name: str, temperature: float):
     return ChatAnthropic(api_key=ANTHROPIC_API_KEY, model_name=model_name, temperature=temperature)
 
 llms = {
+    # OpenAI models with temperature
+
+    "GPT-4o (low temp)": create_openai_llm("gpt-4o", LOW),
+    "GPT-4o (mid temp)": create_openai_llm("gpt-4o", MID),
+    "GPT-4o (high temp)": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
     "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
-    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09",
+    "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", LOW),
+
+    # OpenAI reasoning models (no temperature)
     "o1": create_openai_reasoning_llm("o1-2024-12-17"),
-    "o3-mini": create_openai_reasoning_llm("o3-mini
+    "o3-mini (high-reasoning version)": create_openai_reasoning_llm("o3-mini", reasoning_effort="high"),
+
+    # Anthropic models (Claude)
+    "Claude 3.5 (low temp)": create_anthropic_llm("claude-3-5-sonnet-latest", LOW),
+    "Claude 3.5 (mid temp)": create_anthropic_llm("claude-3-5-sonnet-latest", MID),
+    "Claude 3.5 (high temp)": create_anthropic_llm("claude-3-5-sonnet-latest", HIGH),
+
+    # DeepSeek
+    "Deepseek R1 (low temp) 🧠": create_deepseek_llm("deepseek-reasoner", LOW),
+}
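Because the UI labels double as registry keys, callers resolve a label with a fallback default, the same llms.get(choice, default) pattern app.py uses. A minimal sketch, assuming the keys defined above and a configured OPENAI_API_KEY:

    import asyncio
    from config.llm_config import llms

    async def main():
        # Fall back to the default model if the label is unknown.
        llm = llms.get("o3-mini (high-reasoning version)", llms["GPT-4o (low temp)"])
        response = await llm.ainvoke("Write one plausible distractor for: 2 + 2 = ?")
        print(getattr(response, "content", response))

    asyncio.run(main())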
config/templates.py
CHANGED
@@ -178,6 +178,7 @@ diagnose_scorecard_template = ChatPromptTemplate(
         <example 3>
         1. The exercise contains a double negative: β -- 2. The correct answer does not stand out: β -- 3. Some of the distractors are too obviously false: β -- 4. None of the distractors are actually also kinda correct: β
         </example 3>
+        Sometimes the diagnoses will be short and clear, but sometimes they will be elaborate and view the issue from different angles. In that case, overweight the final sentence of the diagnosis; that is usually where the conclusion is drawn.
         """),
         ("human", "{combined_diagnosis}")
     ],
|