refactor into app/ui
Browse files- app.py +27 -279
- app/ui/common.py +33 -0
- app/ui/diagnoser_tab.py +77 -0
- app/ui/distractors_tab.py +98 -0
- chains/{diagnoser_chain.py → diagnoser/diagnoser_chain.py} +1 -1
- chains/diagnoser/runner.py +64 -0
- chains/{distractors_chain.py → distractors/distractors_chain.py} +2 -2
- chains/distractors/runner.py +60 -0
- config/chain_configs.py +2 -2
- config/templates.py +14 -12
app.py
CHANGED
|
@@ -1,140 +1,18 @@
|
|
| 1 |
# app.py
|
| 2 |
import gradio as gr
|
| 3 |
-
import os
|
| 4 |
-
import asyncio
|
| 5 |
import logging
|
| 6 |
|
| 7 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from utils.auth import login as auth_login
|
| 9 |
from config.chain_configs import chain_configs
|
| 10 |
from config.llm_config import llms
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
| 14 |
-
# --- Callback to update the exercise format dropdown based on LLM selection ---
|
| 15 |
-
def update_exercise_format(selected_model: str):
|
| 16 |
-
# When "Claude3.5" is selected, default the format to XML; otherwise, default to Markdown.
|
| 17 |
-
if "Claude" in selected_model:
|
| 18 |
-
return gr.update(value="XML")
|
| 19 |
-
else:
|
| 20 |
-
return gr.update(value="Plaintext")
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Async wrappers for each chain.
|
| 25 |
-
async def run_diagnoser(user_query: str, model_choice_validate: str, exercise_format_validate: str, sampling_count_validate: str) -> tuple:
|
| 26 |
-
"""
|
| 27 |
-
Diagnose exercise(s) in parallel using a configured DiagnoserChain.
|
| 28 |
-
|
| 29 |
-
This function:
|
| 30 |
-
1. Standardizes the exercise text once using the chain's fixed LLM.
|
| 31 |
-
2. Instantiates the DiagnoserChain with a user-selected diagnosing LLM.
|
| 32 |
-
3. Performs multiple diagnoses in parallel (as many times as `sampling_count_validate`).
|
| 33 |
-
4. Pads the results to ensure a fixed number of output fields (10).
|
| 34 |
-
|
| 35 |
-
Args:
|
| 36 |
-
user_query (str): Raw exercise data submitted by the user.
|
| 37 |
-
model_choice_validate (str): The key/name of the chosen LLM for diagnosing.
|
| 38 |
-
exercise_format_validate (str): The desired format for standardizing the exercise.
|
| 39 |
-
sampling_count_validate (str): A string representing how many diagnoses to run concurrently (e.g., "3").
|
| 40 |
-
|
| 41 |
-
Returns:
|
| 42 |
-
tuple: A tuple of length 10, each containing a diagnosis result (or empty string if not enough samples).
|
| 43 |
-
"""
|
| 44 |
-
# figure out how many times to run
|
| 45 |
-
num_samples = int("".join(filter(str.isdigit, sampling_count_validate)))
|
| 46 |
-
|
| 47 |
-
# Fetch the DiagnoserChain configuration.
|
| 48 |
-
config = chain_configs["diagnoser"]
|
| 49 |
-
|
| 50 |
-
# 1) Standardize the user query exactly once
|
| 51 |
-
standardized_exercise = await standardize_exercise(
|
| 52 |
-
user_query,
|
| 53 |
-
exercise_format_validate,
|
| 54 |
-
config["template_standardize"], # Only if you kept them in config
|
| 55 |
-
config["llm_standardize"]
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
# 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
|
| 59 |
-
chain_instance = config["class"](
|
| 60 |
-
templates_diagnose=config["templates_diagnose"],
|
| 61 |
-
llm_diagnose=llms.get(model_choice_validate, config["llm_diagnose"]),
|
| 62 |
-
template_diagnose_scorecard=config["template_diagnose_scorecard"],
|
| 63 |
-
llm_4o_mini=config["llm_4o_mini"],
|
| 64 |
-
llm_4o=config["llm_4o"]
|
| 65 |
-
)
|
| 66 |
-
|
| 67 |
-
# 3) Run the multiple samples in parallel
|
| 68 |
-
# Create a short helper that does only the "diagnose" steps:
|
| 69 |
-
tasks = [
|
| 70 |
-
chain_instance.diagnose_only(standardized_exercise)
|
| 71 |
-
for _ in range(num_samples)
|
| 72 |
-
]
|
| 73 |
-
# run concurrently
|
| 74 |
-
responses = await asyncio.gather(*tasks)
|
| 75 |
-
|
| 76 |
-
# pad up to 10 if needed
|
| 77 |
-
all_responses = list(responses) + [""] * (10 - len(responses))
|
| 78 |
-
|
| 79 |
-
# Return a tuple of exactly 5 responses.
|
| 80 |
-
return tuple(all_responses)
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
async def run_distractors(
|
| 84 |
-
user_query: str,
|
| 85 |
-
model_choice_distractors_1: str,
|
| 86 |
-
model_choice_distractors_2: str,
|
| 87 |
-
model_choice_distractors_3: str,
|
| 88 |
-
exercise_format_distractors: str,
|
| 89 |
-
sampling_count_distractors: str,
|
| 90 |
-
intermediate_distractors_specification: str,
|
| 91 |
-
final_distractors_specification: str,
|
| 92 |
-
) -> tuple:
|
| 93 |
-
"""
|
| 94 |
-
Generate distractors by running the DistractorsChain multiple times in parallel.
|
| 95 |
-
|
| 96 |
-
1. Standardizes the exercise text once using a fixed LLM.
|
| 97 |
-
2. Constructs a DistractorsChain, where the user can pick two LLMs
|
| 98 |
-
(e.g. one low-temp, one mid-temp) for parallel brainstorming steps.
|
| 99 |
-
3. Invokes the chain ``num_samples`` times in parallel (based on ``sampling_count_distractors``),
|
| 100 |
-
each time producing one consolidated distractors output.
|
| 101 |
-
4. Pads the results to fill 10 output fields.
|
| 102 |
-
"""
|
| 103 |
-
# 0) Parse how many concurrent runs (samples) we want
|
| 104 |
-
num_samples = int("".join(filter(str.isdigit, sampling_count_distractors)))
|
| 105 |
-
# Fetch the DistractorsChain configuration.
|
| 106 |
-
config = chain_configs["distractors"]
|
| 107 |
-
|
| 108 |
-
# 1) Standardize the user query exactly once
|
| 109 |
-
standardized_exercise = await standardize_exercise(
|
| 110 |
-
user_query,
|
| 111 |
-
exercise_format_distractors,
|
| 112 |
-
config["template_standardize"],
|
| 113 |
-
config["llm_standardize"]
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
# 2) Build the DistractorsChain instance
|
| 117 |
-
chain_instance = config["class"](
|
| 118 |
-
template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
|
| 119 |
-
template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
|
| 120 |
-
llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]), # User-selected LLM 1
|
| 121 |
-
llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]), # User-selected LLM 2
|
| 122 |
-
template_consolidate=config["template_consolidate"],
|
| 123 |
-
llm_consolidate=llms.get(model_choice_distractors_3, config["llm_consolidate"]), # User-selected LLM 3
|
| 124 |
-
)
|
| 125 |
-
|
| 126 |
-
# 3) Create N tasks in parallel (one full distractor generation pipeline per sample)
|
| 127 |
-
tasks = [
|
| 128 |
-
chain_instance.run(standardized_exercise, intermediate_distractors_specification, final_distractors_specification) for _ in range(num_samples)
|
| 129 |
-
]
|
| 130 |
-
results = await asyncio.gather(*tasks)
|
| 131 |
-
|
| 132 |
-
# 4) Pad up to 10 outputs to correspond to 10 response fields
|
| 133 |
-
all_responses = list(results) + [""] * (10 - len(results))
|
| 134 |
-
|
| 135 |
-
return tuple(all_responses)
|
| 136 |
-
|
| 137 |
-
|
| 138 |
|
| 139 |
# A generic async runner for simple chains (currently not used)
|
| 140 |
async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
|
|
@@ -194,135 +72,27 @@ with gr.Blocks() as interface:
|
|
| 194 |
gr.Markdown("## Pick the tab for your task of choice")
|
| 195 |
|
| 196 |
with gr.Tabs():
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
)
|
| 217 |
-
exercise_format_validate = gr.Dropdown(
|
| 218 |
-
choices=["Markdown", "XML", "Plaintext", "Raw (input not reformatted)"],
|
| 219 |
-
value="Markdown",
|
| 220 |
-
label="Exercise Reformat",
|
| 221 |
-
interactive=True,
|
| 222 |
-
)
|
| 223 |
-
sampling_count_validate = gr.Dropdown(
|
| 224 |
-
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
|
| 225 |
-
value="1",
|
| 226 |
-
label="Response Count",
|
| 227 |
-
interactive=True,
|
| 228 |
-
)
|
| 229 |
-
# Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
|
| 230 |
-
model_choice_validate.change(
|
| 231 |
-
fn=update_exercise_format,
|
| 232 |
-
inputs=[model_choice_validate],
|
| 233 |
-
outputs=[exercise_format_validate]
|
| 234 |
-
)
|
| 235 |
|
| 236 |
-
diagnoser_input = gr.Textbox(label="Enter exercise in any format", placeholder="Exercise body: <mc:exercise xmlns:mc= ...")
|
| 237 |
-
diagnoser_button = gr.Button("Submit")
|
| 238 |
-
diagnoser_response_1 = gr.Textbox(label="Response 1", interactive=False)
|
| 239 |
-
diagnoser_response_2 = gr.Textbox(label="Response 2", interactive=False)
|
| 240 |
-
diagnoser_response_3 = gr.Textbox(label="Response 3", interactive=False)
|
| 241 |
-
diagnoser_response_4 = gr.Textbox(label="Response 4", interactive=False)
|
| 242 |
-
diagnoser_response_5 = gr.Textbox(label="Response 5", interactive=False)
|
| 243 |
-
diagnoser_response_6 = gr.Textbox(label="Response 6", interactive=False)
|
| 244 |
-
diagnoser_response_7 = gr.Textbox(label="Response 7", interactive=False)
|
| 245 |
-
diagnoser_response_8 = gr.Textbox(label="Response 8", interactive=False)
|
| 246 |
-
diagnoser_response_9 = gr.Textbox(label="Response 9", interactive=False)
|
| 247 |
-
diagnoser_response_10 = gr.Textbox(label="Response 10", interactive=False)
|
| 248 |
|
| 249 |
-
|
| 250 |
-
with gr.TabItem("🤔 Brainstorm distractors"):
|
| 251 |
-
# Insert an HTML info icon with a tooltip at the top of the tab content.
|
| 252 |
-
gr.HTML(
|
| 253 |
-
"""
|
| 254 |
-
<div style="margin-bottom: 10px;">
|
| 255 |
-
<span style="font-size: 1.5em; cursor: help;" title="Generates alternative distractors for the given exercise in two stages. First, 2x2 brainstorming prompts (2 approaches, each using LLM 1 & LLM 2 once) generate a bunch of options, then a final consolidation prompt (using LLM 3) combines all results together for presentation below.\n\nFor both stages, prompts can be customized via dropdowns to influence the amount of distractors that will be generated during each (brainstormed and displayed).\n5-6 LLM calls per final response.">
|
| 256 |
-
ℹ️
|
| 257 |
-
</span>
|
| 258 |
-
</div>
|
| 259 |
-
"""
|
| 260 |
-
)
|
| 261 |
-
|
| 262 |
-
# Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
|
| 263 |
-
with gr.Row():
|
| 264 |
-
model_choice_distractors_1 = gr.Dropdown(
|
| 265 |
-
choices=list(llms.keys()),
|
| 266 |
-
value="GPT-4o (mid temp)",
|
| 267 |
-
label="LLM 1 - for brainstorming",
|
| 268 |
-
interactive=True,
|
| 269 |
-
)
|
| 270 |
-
model_choice_distractors_2 = gr.Dropdown(
|
| 271 |
-
choices=list(llms.keys()),
|
| 272 |
-
value="Claude 3.5 (mid temp)",
|
| 273 |
-
label="LLM 2 - for brainstorming",
|
| 274 |
-
interactive=True,
|
| 275 |
-
)
|
| 276 |
-
exercise_format_distractors = gr.Dropdown(
|
| 277 |
-
choices=["Markdown", "XML", "Plaintext", "Raw (input not reformatted)"],
|
| 278 |
-
value="Plaintext",
|
| 279 |
-
label="Exercise Reformat",
|
| 280 |
-
interactive=True,
|
| 281 |
-
)
|
| 282 |
-
intermediate_distractors_specification = gr.Dropdown(
|
| 283 |
-
choices=[" ", " 2 ", " 3 ", " 4 ", " 5 ", " 6 ", " 7 ", " 8 ", " 9 ", " 10 ", " a few ", " some ", " a whole lot of ", " a wide range of ", " novel "],
|
| 284 |
-
value=" 8 ",
|
| 285 |
-
label="Brainstorm X distractors x4",
|
| 286 |
-
interactive=True,
|
| 287 |
-
)
|
| 288 |
-
model_choice_distractors_3 = gr.Dropdown(
|
| 289 |
-
choices=list(llms.keys()),
|
| 290 |
-
value="GPT-4o (low temp)",
|
| 291 |
-
label="LLM 3 - for consolidation",
|
| 292 |
-
interactive=True,
|
| 293 |
-
)
|
| 294 |
-
final_distractors_specification = gr.Dropdown(
|
| 295 |
-
choices=[" ", " of all unique distractors", " of the top 5", " of the best distractors", " of only the very best", " of the best 4", " of the best 5", " of the best 6", " of the best 7", " of the best 8", " of the best 9", " of the best 10", " of the best 11", " of the best 12", " of a few of them", " of some of them", " of most of them",
|
| 296 |
-
" of a wide range of", " of the 3 worst"],
|
| 297 |
-
value=" of all unique distractors",
|
| 298 |
-
label="Finally display X distractors",
|
| 299 |
-
interactive=True,
|
| 300 |
-
)
|
| 301 |
-
sampling_count_distractors = gr.Dropdown(
|
| 302 |
-
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
|
| 303 |
-
value="1",
|
| 304 |
-
label="Response Count",
|
| 305 |
-
interactive=True,
|
| 306 |
-
)
|
| 307 |
-
# Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
|
| 308 |
-
model_choice_distractors_1.change(
|
| 309 |
-
fn=update_exercise_format,
|
| 310 |
-
inputs=[model_choice_distractors_1],
|
| 311 |
-
outputs=[exercise_format_distractors]
|
| 312 |
-
)
|
| 313 |
-
|
| 314 |
-
distractors_input = gr.Textbox(label="Enter exercise(s) in any format", placeholder="Stelling: Dit is een ..... voorbeeld van een stelling. A. Mooi B. Lelijk ...")
|
| 315 |
-
distractors_button = gr.Button("Submit")
|
| 316 |
-
distractors_response_1 = gr.Textbox(label="Response 1", interactive=False)
|
| 317 |
-
distractors_response_2 = gr.Textbox(label="Response 2", interactive=False)
|
| 318 |
-
distractors_response_3 = gr.Textbox(label="Response 3", interactive=False)
|
| 319 |
-
distractors_response_4 = gr.Textbox(label="Response 4", interactive=False)
|
| 320 |
-
distractors_response_5 = gr.Textbox(label="Response 5", interactive=False)
|
| 321 |
-
distractors_response_6 = gr.Textbox(label="Response 6", interactive=False)
|
| 322 |
-
distractors_response_7 = gr.Textbox(label="Response 7", interactive=False)
|
| 323 |
-
distractors_response_8 = gr.Textbox(label="Response 8", interactive=False)
|
| 324 |
-
distractors_response_9 = gr.Textbox(label="Response 9", interactive=False)
|
| 325 |
-
distractors_response_10 = gr.Textbox(label="Response 10", interactive=False)
|
| 326 |
with gr.TabItem("🚧 Generate learning objectives"):
|
| 327 |
# Insert an HTML info icon with a tooltip at the top of the tab content.
|
| 328 |
gr.HTML(
|
|
@@ -351,19 +121,8 @@ with gr.Blocks() as interface:
|
|
| 351 |
|
| 352 |
diagnoser_button.click(
|
| 353 |
fn=run_diagnoser,
|
| 354 |
-
inputs=[diagnoser_input,
|
| 355 |
-
outputs=[
|
| 356 |
-
diagnoser_response_1,
|
| 357 |
-
diagnoser_response_2,
|
| 358 |
-
diagnoser_response_3,
|
| 359 |
-
diagnoser_response_4,
|
| 360 |
-
diagnoser_response_5,
|
| 361 |
-
diagnoser_response_6,
|
| 362 |
-
diagnoser_response_7,
|
| 363 |
-
diagnoser_response_8,
|
| 364 |
-
diagnoser_response_9,
|
| 365 |
-
diagnoser_response_10
|
| 366 |
-
]
|
| 367 |
)
|
| 368 |
|
| 369 |
distractors_button.click(
|
|
@@ -386,18 +145,7 @@ with gr.Blocks() as interface:
|
|
| 386 |
# 8) final_distractors_specification
|
| 387 |
final_distractors_specification,
|
| 388 |
],
|
| 389 |
-
outputs=[
|
| 390 |
-
distractors_response_1,
|
| 391 |
-
distractors_response_2,
|
| 392 |
-
distractors_response_3,
|
| 393 |
-
distractors_response_4,
|
| 394 |
-
distractors_response_5,
|
| 395 |
-
distractors_response_6,
|
| 396 |
-
distractors_response_7,
|
| 397 |
-
distractors_response_8,
|
| 398 |
-
distractors_response_9,
|
| 399 |
-
distractors_response_10
|
| 400 |
-
]
|
| 401 |
)
|
| 402 |
|
| 403 |
# Launch the app.
|
|
|
|
| 1 |
# app.py
|
| 2 |
import gradio as gr
|
|
|
|
|
|
|
| 3 |
import logging
|
| 4 |
|
| 5 |
+
from app.ui.common import update_exercise_format
|
| 6 |
+
from app.ui.diagnoser_tab import build_diagnoser_tab
|
| 7 |
+
from app.ui.distractors_tab import build_distractors_tab
|
| 8 |
+
from chains.diagnoser.runner import run_diagnoser
|
| 9 |
+
from chains.distractors.runner import run_distractors
|
| 10 |
from utils.auth import login as auth_login
|
| 11 |
from config.chain_configs import chain_configs
|
| 12 |
from config.llm_config import llms
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# A generic async runner for simple chains (currently not used)
|
| 18 |
async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
|
|
|
|
| 72 |
gr.Markdown("## Pick the tab for your task of choice")
|
| 73 |
|
| 74 |
with gr.Tabs():
|
| 75 |
+
(model_choice_diagnose,
|
| 76 |
+
exercise_format_diagnose,
|
| 77 |
+
sampling_count_diagnose,
|
| 78 |
+
diagnoser_input,
|
| 79 |
+
diagnoser_button,
|
| 80 |
+
diagnoser_responses
|
| 81 |
+
) = build_diagnoser_tab()
|
| 82 |
+
|
| 83 |
+
(model_choice_distractors_1,
|
| 84 |
+
model_choice_distractors_2,
|
| 85 |
+
model_choice_distractors_3,
|
| 86 |
+
exercise_format_distractors,
|
| 87 |
+
sampling_count_distractors,
|
| 88 |
+
distractors_input,
|
| 89 |
+
distractors_button,
|
| 90 |
+
distractors_responses,
|
| 91 |
+
intermediate_distractors_specification,
|
| 92 |
+
final_distractors_specification,
|
| 93 |
+
) = build_distractors_tab()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
with gr.TabItem("🚧 Generate learning objectives"):
|
| 97 |
# Insert an HTML info icon with a tooltip at the top of the tab content.
|
| 98 |
gr.HTML(
|
|
|
|
| 121 |
|
| 122 |
diagnoser_button.click(
|
| 123 |
fn=run_diagnoser,
|
| 124 |
+
inputs=[diagnoser_input, model_choice_diagnose, exercise_format_diagnose, sampling_count_diagnose],
|
| 125 |
+
outputs=[diagnoser_responses]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
)
|
| 127 |
|
| 128 |
distractors_button.click(
|
|
|
|
| 145 |
# 8) final_distractors_specification
|
| 146 |
final_distractors_specification,
|
| 147 |
],
|
| 148 |
+
outputs=[distractors_responses]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
# Launch the app.
|
app/ui/common.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ui/common.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
|
| 4 |
+
# --- Callback to update the exercise format dropdown based on LLM selection ---
|
| 5 |
+
def update_exercise_format(selected_model: str):
|
| 6 |
+
"""
|
| 7 |
+
When the user picks a new model:
|
| 8 |
+
- If it has 'Claude' in the name, default format to XML.
|
| 9 |
+
- Otherwise, default to Plaintext.
|
| 10 |
+
"""
|
| 11 |
+
if "Claude" in selected_model:
|
| 12 |
+
return gr.update(value="XML")
|
| 13 |
+
else:
|
| 14 |
+
return gr.update(value="Plaintext")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def update_response_textboxes_amount(sampling_count: str):
|
| 18 |
+
"""
|
| 19 |
+
Dynamically show/hide Response textboxes based on sampling count.
|
| 20 |
+
"""
|
| 21 |
+
# Convert string to integer
|
| 22 |
+
num = int(sampling_count)
|
| 23 |
+
|
| 24 |
+
# We'll return a list of 10 updates, one for each textbox.
|
| 25 |
+
updates = []
|
| 26 |
+
for i in range(10):
|
| 27 |
+
if i < num:
|
| 28 |
+
# Show and label (i+1), if you like
|
| 29 |
+
updates.append(gr.Textbox.update(visible=True, label=f"Response {i+1}"))
|
| 30 |
+
else:
|
| 31 |
+
# Hide the rest
|
| 32 |
+
updates.append(gr.Textbox.update(visible=False, label=f"Response {i+1}"))
|
| 33 |
+
return updates
|
app/ui/diagnoser_tab.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ui/diagnoser_tab.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from config.llm_config import llms
|
| 4 |
+
from common import update_exercise_format, update_response_textboxes_amount
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def build_diagnoser_tab():
|
| 8 |
+
"""
|
| 9 |
+
Builds and returns the Diagnoser tab UI elements (and any references).
|
| 10 |
+
"""
|
| 11 |
+
with gr.TabItem("🩺 Diagnose exercise"):
|
| 12 |
+
# Insert an HTML info icon with a tooltip at the top of the tab content.
|
| 13 |
+
gr.HTML(
|
| 14 |
+
"""
|
| 15 |
+
<div style="margin-bottom: 10px;">
|
| 16 |
+
<span style="font-size: 1.5em; cursor: help;" title="Diagnoses exercise for their 4 most common issues.\n\nThe exercise format dropdown decides into what standardized format the exercise is converted initially for intermediate processing, to ensure reliable performance irrespective of source format.\nAnthropic models typically work better with XML, OpenAI's with markdown.\n\nResponse count is the amount of times a final response will be generated in the fields below (5-6 LLM queries for each).">
|
| 17 |
+
ℹ️ <i>←</i>
|
| 18 |
+
</span>
|
| 19 |
+
</div>
|
| 20 |
+
"""
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
|
| 24 |
+
with gr.Row():
|
| 25 |
+
model_choice_diagnose = gr.Dropdown(
|
| 26 |
+
choices=list(llms.keys()),
|
| 27 |
+
value="GPT-4o (low temp)",
|
| 28 |
+
label="Select LLM",
|
| 29 |
+
interactive=True,
|
| 30 |
+
)
|
| 31 |
+
exercise_format_diagnose = gr.Dropdown(
|
| 32 |
+
choices=["Markdown", "XML", "Plaintext", "Raw (input not reformatted)"],
|
| 33 |
+
value="Markdown",
|
| 34 |
+
label="Exercise Reformat",
|
| 35 |
+
interactive=True,
|
| 36 |
+
)
|
| 37 |
+
sampling_count_diagnose = gr.Dropdown(
|
| 38 |
+
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
|
| 39 |
+
value="1",
|
| 40 |
+
label="Response Count",
|
| 41 |
+
interactive=True,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# The user input
|
| 45 |
+
diagnoser_input = gr.Textbox(label="Enter exercise in any format",
|
| 46 |
+
placeholder="Exercise body: <mc:exercise xmlns:mc= ...")
|
| 47 |
+
# A button to run the chain
|
| 48 |
+
diagnoser_button = gr.Button("Submit")
|
| 49 |
+
|
| 50 |
+
# Create 10 Response textboxes
|
| 51 |
+
diagnoser_responses = [
|
| 52 |
+
gr.Textbox(label=f"Response {i + 1}", interactive=False, visible=(i == 0))
|
| 53 |
+
for i in range(10)
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
# Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
|
| 57 |
+
model_choice_diagnose.change(
|
| 58 |
+
fn=update_exercise_format,
|
| 59 |
+
inputs=[model_choice_diagnose],
|
| 60 |
+
outputs=[exercise_format_diagnose]
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
# Callback to show/hide Response textboxes
|
| 64 |
+
sampling_count_diagnose.change(
|
| 65 |
+
fn=update_response_textboxes_amount,
|
| 66 |
+
inputs=[sampling_count_diagnose],
|
| 67 |
+
outputs=diagnoser_responses
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
return (
|
| 71 |
+
model_choice_diagnose,
|
| 72 |
+
exercise_format_diagnose,
|
| 73 |
+
sampling_count_diagnose,
|
| 74 |
+
diagnoser_input,
|
| 75 |
+
diagnoser_button,
|
| 76 |
+
diagnoser_responses
|
| 77 |
+
)
|
app/ui/distractors_tab.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ui/distractors_tab.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from common import update_response_textboxes_amount, update_exercise_format
|
| 4 |
+
from config.llm_config import llms
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def build_distractors_tab():
|
| 8 |
+
"""
|
| 9 |
+
Builds and returns the Diagnoser tab UI elements (and any references).
|
| 10 |
+
"""
|
| 11 |
+
with gr.TabItem("🤔 Brainstorm distractors"):
|
| 12 |
+
# Insert an HTML info icon with a tooltip at the top of the tab content.
|
| 13 |
+
gr.HTML(
|
| 14 |
+
"""
|
| 15 |
+
<div style="margin-bottom: 10px;">
|
| 16 |
+
<span style="font-size: 1.5em; cursor: help;" title="Generates alternative distractors for the given exercise in two stages. First, 2x2 brainstorming prompts (2 approaches, each using LLM 1 & LLM 2 once) generate a bunch of options, then a final consolidation prompt (using LLM 3) combines all results together for presentation below.\n\nFor both stages, prompts can be customized via dropdowns to influence the amount of distractors that will be generated during each (brainstormed and displayed).\n5-6 LLM calls per final response.">
|
| 17 |
+
ℹ️
|
| 18 |
+
</span>
|
| 19 |
+
</div>
|
| 20 |
+
"""
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Create a row for the control dropdowns: LLM selection, exercise format, sampling count etc.
|
| 24 |
+
with gr.Row():
|
| 25 |
+
model_choice_distractors_1 = gr.Dropdown(
|
| 26 |
+
choices=list(llms.keys()),
|
| 27 |
+
value="GPT-4o (mid temp)",
|
| 28 |
+
label="LLM 1 - for brainstorming",
|
| 29 |
+
interactive=True,
|
| 30 |
+
)
|
| 31 |
+
model_choice_distractors_2 = gr.Dropdown(
|
| 32 |
+
choices=list(llms.keys()),
|
| 33 |
+
value="Claude 3.5 (mid temp)",
|
| 34 |
+
label="LLM 2 - for brainstorming",
|
| 35 |
+
interactive=True,
|
| 36 |
+
)
|
| 37 |
+
exercise_format_distractors = gr.Dropdown(
|
| 38 |
+
choices=["Markdown", "XML", "Plaintext", "Raw (input not reformatted)"],
|
| 39 |
+
value="Plaintext",
|
| 40 |
+
label="Exercise Reformat",
|
| 41 |
+
interactive=True,
|
| 42 |
+
)
|
| 43 |
+
intermediate_distractors_specification = gr.Dropdown(
|
| 44 |
+
choices=[" ", " 2 ", " 3 ", " 4 ", " 5 ", " 6 ", " 7 ", " 8 ", " 9 ", " 10 ", " a few ", " some ",
|
| 45 |
+
" a whole lot of ", " a wide range of ", " novel "],
|
| 46 |
+
value=" 8 ",
|
| 47 |
+
label="Brainstorm X distractors x4",
|
| 48 |
+
interactive=True,
|
| 49 |
+
)
|
| 50 |
+
model_choice_distractors_3 = gr.Dropdown(
|
| 51 |
+
choices=list(llms.keys()),
|
| 52 |
+
value="GPT-4o (low temp)",
|
| 53 |
+
label="LLM 3 - for consolidation",
|
| 54 |
+
interactive=True,
|
| 55 |
+
)
|
| 56 |
+
final_distractors_specification = gr.Dropdown(
|
| 57 |
+
choices=[" ", " of all unique distractors", " of the top 5", " of the best distractors",
|
| 58 |
+
" of only the very best", " of the best 4", " of the best 5", " of the best 6",
|
| 59 |
+
" of the best 7", " of the best 8", " of the best 9", " of the best 10", " of the best 11",
|
| 60 |
+
" of the best 12", " of a few of them", " of some of them", " of most of them",
|
| 61 |
+
" of a wide range of", " of the 3 worst"],
|
| 62 |
+
value=" of all unique distractors",
|
| 63 |
+
label="Finally display X distractors",
|
| 64 |
+
interactive=True,
|
| 65 |
+
)
|
| 66 |
+
sampling_count_distractors = gr.Dropdown(
|
| 67 |
+
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
|
| 68 |
+
value="1",
|
| 69 |
+
label="Response Count",
|
| 70 |
+
interactive=True,
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
distractors_input = gr.Textbox(label="Enter exercise(s) in any format",
|
| 74 |
+
placeholder="Stelling: Dit is een ..... voorbeeld van een stelling. A. Mooi B. Lelijk ...")
|
| 75 |
+
distractors_button = gr.Button("Submit")
|
| 76 |
+
|
| 77 |
+
# Create 10 Response textboxes
|
| 78 |
+
distractors_responses = [
|
| 79 |
+
gr.Textbox(label=f"Response {i + 1}", interactive=False, visible=(i == 0))
|
| 80 |
+
for i in range(10)
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Set up a change callback so that if the user selects any model with "Claude" in the name, the exercise format updates to "XML"
|
| 85 |
+
model_choice_distractors_1.change(
|
| 86 |
+
fn=update_exercise_format,
|
| 87 |
+
inputs=[model_choice_distractors_1],
|
| 88 |
+
outputs=[exercise_format_distractors]
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Callback to show/hide Response textboxes
|
| 92 |
+
sampling_count_distractors.change(
|
| 93 |
+
fn=update_response_textboxes_amount,
|
| 94 |
+
inputs=[sampling_count_distractors],
|
| 95 |
+
outputs=distractors_responses
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
return()
|
chains/{diagnoser_chain.py → diagnoser/diagnoser_chain.py}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# chains/diagnoser_chain.py
|
| 2 |
import asyncio
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import Any, List
|
|
|
|
| 1 |
+
# chains/diagnoser/diagnoser_chain.py
|
| 2 |
import asyncio
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import Any, List
|
chains/diagnoser/runner.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# chains/diagnoser/diagnoser_runner.py
|
| 2 |
+
import asyncio
|
| 3 |
+
|
| 4 |
+
from config.chain_configs import chain_configs
|
| 5 |
+
from config.exercise_standardizer import standardize_exercise
|
| 6 |
+
from config.llm_config import llms
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
async def run_diagnoser(
    user_query: str,
    model_choice_validate: str,
    exercise_format_validate: str,
    sampling_count_validate: str,
) -> tuple:
    """
    Diagnose exercise(s) in parallel using a configured DiagnoserChain.

    This function:
        1. Standardizes the exercise text once using the chain's fixed LLM.
        2. Instantiates the DiagnoserChain with a user-selected diagnosing LLM.
        3. Performs multiple diagnoses in parallel (as many times as
           ``sampling_count_validate``).
        4. Pads the results to ensure a fixed number of output fields (10).

    Args:
        user_query (str): Raw exercise data submitted by the user.
        model_choice_validate (str): The key/name of the chosen LLM for diagnosing.
        exercise_format_validate (str): The desired format for standardizing the exercise.
        sampling_count_validate (str): A string representing how many diagnoses to
            run concurrently (e.g., "3").

    Returns:
        tuple: A tuple of length 10, each element a diagnosis result (or an empty
            string when fewer samples than 10 were requested).
    """
    # Extract the digits from the sampling-count string. Fall back to a single
    # sample when no digits are present: int("") would raise ValueError.
    digits = "".join(filter(str.isdigit, sampling_count_validate))
    num_samples = int(digits) if digits else 1

    # Fetch the DiagnoserChain configuration.
    config = chain_configs["diagnoser"]

    # 1) Standardize the user query exactly once, using the chain's fixed
    #    standardization template and LLM.
    standardized_exercise = await standardize_exercise(
        user_query,
        exercise_format_validate,
        config["template_standardize"],
        config["llm_standardize"],
    )

    # 2) Instantiate the DiagnoserChain. The diagnosing LLM is the user's
    #    selection, falling back to the configured default when the key is
    #    not present in `llms`.
    chain_instance = config["class"](
        templates_diagnose=config["templates_diagnose"],
        llm_diagnose=llms.get(model_choice_validate, config["llm_diagnose"]),
        template_diagnose_scorecard=config["template_diagnose_scorecard"],
        llm_4o_mini=config["llm_4o_mini"],
        llm_4o=config["llm_4o"],
    )

    # 3) Run the diagnose-only step `num_samples` times concurrently.
    tasks = [
        chain_instance.diagnose_only(standardized_exercise)
        for _ in range(num_samples)
    ]
    responses = await asyncio.gather(*tasks)

    # 4) Pad up to 10 entries so the UI's fixed set of output fields is filled.
    all_responses = list(responses) + [""] * (10 - len(responses))

    return tuple(all_responses)
|
chains/{distractors_chain.py → distractors/distractors_chain.py}
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# chains/distractors_chain.py
|
| 2 |
import asyncio
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import Any
|
|
@@ -73,7 +73,7 @@ class DistractorsChain(BaseModel):
|
|
| 73 |
brainstorm_results = await asyncio.gather(*tasks)
|
| 74 |
|
| 75 |
# Combine them in a single multiline string
|
| 76 |
-
combined_brainstorms = "\n\n
|
| 77 |
|
| 78 |
# --- Step 3: Consolidate the 4 partial outputs into a final response ---
|
| 79 |
consolidation_prompt = await self.template_consolidate.aformat_prompt(
|
|
|
|
| 1 |
+
# chains/distractors/distractors_chain.py
|
| 2 |
import asyncio
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import Any
|
|
|
|
| 73 |
brainstorm_results = await asyncio.gather(*tasks)
|
| 74 |
|
| 75 |
# Combine them in a single multiline string
|
| 76 |
+
combined_brainstorms = "\n\n".join(brainstorm_results)
|
| 77 |
|
| 78 |
# --- Step 3: Consolidate the 4 partial outputs into a final response ---
|
| 79 |
consolidation_prompt = await self.template_consolidate.aformat_prompt(
|
chains/distractors/runner.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# chains/distractors/runner.py
|
| 2 |
+
import asyncio
|
| 3 |
+
|
| 4 |
+
from config.chain_configs import chain_configs
|
| 5 |
+
from config.exercise_standardizer import standardize_exercise
|
| 6 |
+
from config.llm_config import llms
|
| 7 |
+
|
| 8 |
+
async def run_distractors(
    user_query: str,
    model_choice_distractors_1: str,
    model_choice_distractors_2: str,
    model_choice_distractors_3: str,
    exercise_format_distractors: str,
    sampling_count_distractors: str,
    intermediate_distractors_specification: str,
    final_distractors_specification: str,
) -> tuple:
    """
    Generate distractors by running the DistractorsChain multiple times in parallel.

    1. Standardizes the exercise text once using a fixed LLM.
    2. Constructs a DistractorsChain, where the user can pick two LLMs
       (e.g. one low-temp, one mid-temp) for parallel brainstorming steps.
    3. Invokes the chain ``num_samples`` times in parallel (based on
       ``sampling_count_distractors``), each time producing one consolidated
       distractors output.
    4. Pads the results to fill 10 output fields.

    Args:
        user_query (str): Raw exercise data submitted by the user.
        model_choice_distractors_1 (str): LLM key for the first brainstorm step.
        model_choice_distractors_2 (str): LLM key for the second brainstorm step.
        model_choice_distractors_3 (str): LLM key for the consolidation step.
        exercise_format_distractors (str): Desired format for standardizing the exercise.
        sampling_count_distractors (str): How many concurrent runs to perform (e.g. "3").
        intermediate_distractors_specification (str): Spec passed to the brainstorm prompts.
        final_distractors_specification (str): Spec passed to the consolidation prompt.

    Returns:
        tuple: A tuple of length 10, each element one consolidated distractors
            output (or an empty string when fewer samples than 10 were requested).
    """
    # 0) Parse how many concurrent runs (samples) we want. Fall back to a
    #    single sample when the string has no digits: int("") raises ValueError.
    digits = "".join(filter(str.isdigit, sampling_count_distractors))
    num_samples = int(digits) if digits else 1

    # Fetch the DistractorsChain configuration.
    config = chain_configs["distractors"]

    # 1) Standardize the user query exactly once.
    standardized_exercise = await standardize_exercise(
        user_query,
        exercise_format_distractors,
        config["template_standardize"],
        config["llm_standardize"],
    )

    # 2) Build the DistractorsChain instance. Each user-selected LLM key falls
    #    back to the configured default when it is not present in `llms`.
    chain_instance = config["class"](
        template_distractors_brainstorm_1=config["template_distractors_brainstorm_1"],
        template_distractors_brainstorm_2=config["template_distractors_brainstorm_2"],
        llm_brainstorm_1=llms.get(model_choice_distractors_1, config["llm_brainstorm_1"]),  # User-selected LLM 1
        llm_brainstorm_2=llms.get(model_choice_distractors_2, config["llm_brainstorm_2"]),  # User-selected LLM 2
        template_consolidate=config["template_consolidate"],
        llm_consolidate=llms.get(model_choice_distractors_3, config["llm_consolidate"]),  # User-selected LLM 3
    )

    # 3) Create N tasks in parallel (one full distractors generation pipeline
    #    per sample) and await them all.
    tasks = [
        chain_instance.run(
            standardized_exercise,
            intermediate_distractors_specification,
            final_distractors_specification,
        )
        for _ in range(num_samples)
    ]
    results = await asyncio.gather(*tasks)

    # 4) Pad up to 10 outputs to correspond to 10 response fields.
    all_responses = list(results) + [""] * (10 - len(results))

    return tuple(all_responses)
|
config/chain_configs.py
CHANGED
|
@@ -10,8 +10,8 @@ from config.templates import (
|
|
| 10 |
template_distractors_brainstorm_2,
|
| 11 |
template_consolidate_distractors
|
| 12 |
)
|
| 13 |
-
from chains.diagnoser_chain import DiagnoserChain
|
| 14 |
-
from chains.distractors_chain import DistractorsChain
|
| 15 |
from config.llm_config import llms
|
| 16 |
|
| 17 |
# Note: The default LLM here is GPT-4o (low temp); the UI can override this choice.
|
|
|
|
| 10 |
template_distractors_brainstorm_2,
|
| 11 |
template_consolidate_distractors
|
| 12 |
)
|
| 13 |
+
from chains.diagnoser.diagnoser_chain import DiagnoserChain
|
| 14 |
+
from chains.distractors.distractors_chain import DistractorsChain
|
| 15 |
from config.llm_config import llms
|
| 16 |
|
| 17 |
# Note: The default LLM here is GPT-4o (low temp); the UI can override this choice.
|
config/templates.py
CHANGED
|
@@ -144,9 +144,9 @@ template_diagnose_distractor_partially_correct = ChatPromptTemplate(
|
|
| 144 |
messages=[
|
| 145 |
("system", """You analyze a multiple-choice exercise to detect distractors that are
|
| 146 |
partially correct. Some answer choices may contain elements of truth, leading to
|
| 147 |
-
ambiguity. Identify such cases. Really stress-test them: is there a story you could tell where the
|
| 148 |
After this, consider if this is bad enough in the context of this question. It's fine if the correct answer is still obviously most correct, and some distractors that contain elements of truth. This is only a problem if the gap becomes too small.
|
| 149 |
-
As an intuition pump, ask this question: would there be any experts that would consider this
|
| 150 |
Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't unambiguously false, just give a short one-sentence diagnosis on this.
|
| 151 |
If the issue is more nuanced, do some reasoning first, and give your diagnosis then.
|
| 152 |
"""),
|
|
@@ -160,8 +160,8 @@ diagnose_scorecard_template = ChatPromptTemplate(
|
|
| 160 |
("system", """You analyze the results of the diagnoses of 4 potential issues that multiple choice exercises sometimes have, and consolidate those into a very simple one-line visual scorecard that summarizes all issues' diagnoses, to show the results clearly in one overview. The diagnoses concern the following 4 potential issues:
|
| 161 |
1. Double negatives (if the exercise contains something like 'to not not do something', this is undesirable)
|
| 162 |
2. The correct multiple choice answer option stands out from the rest (this is a hint for the student)
|
| 163 |
-
3. A
|
| 164 |
-
4. A
|
| 165 |
Use these two icons:
|
| 166 |
- ✅ means the diagnosis of the issue came back negative, so the issue is not present.
|
| 167 |
- ❌ means the diagnosis of the issue came back positive, so the issue is present.
|
|
@@ -204,17 +204,17 @@ template_distractors_brainstorm_2 = ChatPromptTemplate(
|
|
| 204 |
"You can think about this as a spectrum between 'too correct' & 'too obviously false'. Or, in other words, a spectrum between two extreme ends that can be described as: "
|
| 205 |
"'An answer option that is not the correct answer to the question, yet extremely similar in meaning and scope to the correct answer, such that it's very debatable whether this answer option is not in fact also actually correct' & "
|
| 206 |
"'An answer option that is exceedingly unlikely, fantastical, off-base or ridiculous and therefore maximally obviously incorrect, such that no one who can read would think this could ever be the correct answer to the question'\n"
|
| 207 |
-
"Whether any particular
|
| 208 |
"This often depends on many aspects to do with question, for example its exact phrasing, specific (background) domain-knowledge related to the subject, "
|
| 209 |
"and assumptions about what test takers in the target group for this exercise already can be assumed to know or not know, and their intelligence.\n"
|
| 210 |
"In other words, it is not easy to pick distractors that are positioned inside the acceptable range on this spectrum. "
|
| 211 |
"Therefore, really try to go about your task here methodically: first establish the borders of the acceptable range of distractors by lingering there for a bit; taking into account the specific context of the given question, as follows.\n\n"
|
| 212 |
"Before drafting the final list, first come up with one or two faulty distractors, that are faulty in the sense that they would be júst too much on the 'too correct' side of the aforementioned spectrum.\n"
|
| 213 |
"Then, come up with one or two distractors that are júst faulty on the other side of that spectrum: júst too much on the side of 'too obviously false'.\n"
|
| 214 |
-
"As an intuition pump for the first category (distractors that are júst too correct), try to imagine experts in the question's domain discussing the answer option, and some of them arguing that the
|
| 215 |
-
"As an intuition pump for the second category (distractors that are júst too obviously incorrect), try to image a student who is both generally stupid (bottom of his class) ánd uninformed about the given topic (didn't prepare for the test). Would even they júst so find it easy to eliminate the faulty
|
| 216 |
"Those are the two bounds of the spectrum range we aim to operate between during brainstorming.\n"
|
| 217 |
-
"So, through the above process of picking some júst faulty distractors in the context of the given question, both barely too correct and barely too obviously false, you establish the two bounds of acceptable distractors. When brainstorming, don't play it entirely safe though; when in doubt about where exactly on the spectrum the
|
| 218 |
"Next, in the brainstorming phase, it's most important that you get really creative and really try to think outside the box, to come up with the required potential alternative answer options to the exercise. We want to approach this task from all different angles, "
|
| 219 |
"to arrive at a varied selection of options, to serve as inspiration for a later stage of final selection (not now) to make the exercise the best it can be. For now, carry out the above-described prep in writing, then draft the list of{intermediate_distractors_specification} alternative distractors (in the same language as the existing exercise)."),
|
| 220 |
("human", "{standardized_exercise}")
|
|
@@ -227,13 +227,15 @@ template_distractors_brainstorm_2 = ChatPromptTemplate(
|
|
| 227 |
|
| 228 |
template_consolidate_distractors = ChatPromptTemplate(
|
| 229 |
messages=[
|
| 230 |
-
("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated into one list. "
|
| 231 |
-
"
|
| 232 |
-
"Only focus on the distractors (answer options) themselves,
|
| 233 |
"For context, this is the exercise that the distractors are about:\n "
|
| 234 |
"{standardized_exercise}"),
|
| 235 |
("human", "Here are the lists:\n "
|
| 236 |
-
"{brainstorm_outputs} "
|
|
|
|
|
|
|
| 237 |
],
|
| 238 |
input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
|
| 239 |
)
|
|
|
|
| 144 |
messages=[
|
| 145 |
("system", """You analyze a multiple-choice exercise to detect distractors that are
|
| 146 |
partially correct. Some answer choices may contain elements of truth, leading to
|
| 147 |
+
ambiguity. Identify such cases. Really stress-test them: is there a story you could tell where the distractors, in the context of this exercise, could be considered a (partially) correct answer?
|
| 148 |
After this, consider if this is bad enough in the context of this question. It's fine if the correct answer is still obviously most correct, and some distractors that contain elements of truth. This is only a problem if the gap becomes too small.
|
| 149 |
+
As an intuition pump, ask this question: would there be any experts that would consider this distractors also a correct answer? If so, diagnose the problem. If not, it's fine.
|
| 150 |
Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't unambiguously false, just give a short one-sentence diagnosis on this.
|
| 151 |
If the issue is more nuanced, do some reasoning first, and give your diagnosis then.
|
| 152 |
"""),
|
|
|
|
| 160 |
("system", """You analyze the results of the diagnoses of 4 potential issues that multiple choice exercises sometimes have, and consolidate those into a very simple one-line visual scorecard that summarizes all issues' diagnoses, to show the results clearly in one overview. The diagnoses concern the following 4 potential issues:
|
| 161 |
1. Double negatives (if the exercise contains something like 'to not not do something', this is undesirable)
|
| 162 |
2. The correct multiple choice answer option stands out from the rest (this is a hint for the student)
|
| 163 |
+
3. A distractors answer option is too obviously false (it's useless, no student would ever pick it)
|
| 164 |
+
4. A distractors answer option is actually also kinda correct (it's misleading, if a student picks it they're not 100% wrong)
|
| 165 |
Use these two icons:
|
| 166 |
- ✅ means the diagnosis of the issue came back negative, so the issue is not present.
|
| 167 |
- ❌ means the diagnosis of the issue came back positive, so the issue is present.
|
|
|
|
| 204 |
"You can think about this as a spectrum between 'too correct' & 'too obviously false'. Or, in other words, a spectrum between two extreme ends that can be described as: "
|
| 205 |
"'An answer option that is not the correct answer to the question, yet extremely similar in meaning and scope to the correct answer, such that it's very debatable whether this answer option is not in fact also actually correct' & "
|
| 206 |
"'An answer option that is exceedingly unlikely, fantastical, off-base or ridiculous and therefore maximally obviously incorrect, such that no one who can read would think this could ever be the correct answer to the question'\n"
|
| 207 |
+
"Whether any particular distractors falls on the 'too correct' or 'too obviously incorrect' parts of the spectrum, is highly context-dependent. "
|
| 208 |
"This often depends on many aspects to do with question, for example its exact phrasing, specific (background) domain-knowledge related to the subject, "
|
| 209 |
"and assumptions about what test takers in the target group for this exercise already can be assumed to know or not know, and their intelligence.\n"
|
| 210 |
"In other words, it is not easy to pick distractors that are positioned inside the acceptable range on this spectrum. "
|
| 211 |
"Therefore, really try to go about your task here methodically: first establish the borders of the acceptable range of distractors by lingering there for a bit; taking into account the specific context of the given question, as follows.\n\n"
|
| 212 |
"Before drafting the final list, first come up with one or two faulty distractors, that are faulty in the sense that they would be júst too much on the 'too correct' side of the aforementioned spectrum.\n"
|
| 213 |
"Then, come up with one or two distractors that are júst faulty on the other side of that spectrum: júst too much on the side of 'too obviously false'.\n"
|
| 214 |
+
"As an intuition pump for the first category (distractors that are júst too correct), try to imagine experts in the question's domain discussing the answer option, and some of them arguing that the distractors would also be a valid answer to the given question. "
|
| 215 |
+
"As an intuition pump for the second category (distractors that are júst too obviously incorrect), try to image a student who is both generally stupid (bottom of his class) ánd uninformed about the given topic (didn't prepare for the test). Would even they júst so find it easy to eliminate the faulty distractors as clearly false?\n"
|
| 216 |
"Those are the two bounds of the spectrum range we aim to operate between during brainstorming.\n"
|
| 217 |
+
"So, through the above process of picking some júst faulty distractors in the context of the given question, both barely too correct and barely too obviously false, you establish the two bounds of acceptable distractors. When brainstorming, don't play it entirely safe though; when in doubt about where exactly on the spectrum the distractors would lie, just list the distractors you came up with anyway.\n\n"
|
| 218 |
"Next, in the brainstorming phase, it's most important that you get really creative and really try to think outside the box, to come up with the required potential alternative answer options to the exercise. We want to approach this task from all different angles, "
|
| 219 |
"to arrive at a varied selection of options, to serve as inspiration for a later stage of final selection (not now) to make the exercise the best it can be. For now, carry out the above-described prep in writing, then draft the list of{intermediate_distractors_specification} alternative distractors (in the same language as the existing exercise)."),
|
| 220 |
("human", "{standardized_exercise}")
|
|
|
|
| 227 |
|
| 228 |
template_consolidate_distractors = ChatPromptTemplate(
|
| 229 |
messages=[
|
| 230 |
+
("system", "You are given several lists of potential distractors (answer options to a multiple choice exercise), that need to be consolidated and/or trimmed down into one list. "
|
| 231 |
+
"Always at least filter out duplicates, do some logical sorting, and return one plain list{final_distractors_specification}. "
|
| 232 |
+
"Only focus on the distractors (answer options) themselves, don't carry over any reasoning about them. Return only the list. Format the list without numbering or bullet points, just put one distractors per line. Use the same language as the existing exercise.\n\n"
|
| 233 |
"For context, this is the exercise that the distractors are about:\n "
|
| 234 |
"{standardized_exercise}"),
|
| 235 |
("human", "Here are the lists:\n "
|
| 236 |
+
"{brainstorm_outputs}\n\n "
|
| 237 |
+
"--- end of lists ---\n\n"
|
| 238 |
+
"Now, your task is to return one plain list{final_distractors_specification}.")
|
| 239 |
],
|
| 240 |
input_variables=["standardized_exercise", "brainstorm_outputs", "final_distractors_specification"]
|
| 241 |
)
|