BtB-ExpC committed on
Commit f462dee · 1 Parent(s): 21f7b3d

refactored diagnoser chain

app.py CHANGED
@@ -4,7 +4,8 @@ import os
 import asyncio
 import logging
 
-from utils.auth import login as auth_login # Simple authentication
+from config.exercise_standardizer import standardize_exercise
+from utils.auth import login as auth_login
 from config.chain_configs import chain_configs
 from config.llm_config import llms
 
@@ -16,7 +17,7 @@ def update_exercise_format(selected_model: str):
     if selected_model == "Claude 3.5":
         return gr.update(value="XML")
     else:
-        return gr.update(value="Markdown")
+        return gr.update(value="Plaintext")
 
 # A generic async runner for chains.
 async def run_chain(chain_name: str, input_variables: dict, selected_model: str):
@@ -61,26 +62,40 @@ async def run_chain(chain_name: str, input_variables: dict, selected_model: str)
 
 # Async wrappers for each chain.
 async def run_diagnoser(user_query: str, chosen_model: str, exercise_format: str, sampling_count: str) -> tuple:
+    # figure out how many times to run
     num_samples = int("".join(filter(str.isdigit, sampling_count)))
+
     # Fetch the DiagnoserChain configuration.
     config = chain_configs["diagnoser"]
 
-    # Instantiate DiagnoserChain using:
-    # - A fixed LLM for standardizing (gpt4o-mini)
-    # - The user-selected model for diagnosis (overriding the default)
+    # 1) Standardize the user query exactly once
+    standardized_exercise = await standardize_exercise(
+        user_query,
+        exercise_format,
+        config["template_standardize"],
+        config["llm_standardize"]
+    )
+
+    # 2) Instantiate the DiagnoserChain using the user-selected LLM for diagnosing
     chain_instance = config["class"](
-        template_standardize=config["template_standardize"],
         templates_diagnose=config["templates_diagnose"],
+        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]),
         template_diagnose_scorecard=config["template_diagnose_scorecard"],
-        llm_standardize=config["llm_standardize"], # Fixed: gpt4o-mini
-        llm_diagnose=llms.get(chosen_model, config["llm_diagnose"]) # Override or fall back to default
+        llm_4o_mini=config["llm_4o_mini"]
     )
-    responses = []
-    for i in range(num_samples):
-        response = await chain_instance.run(user_query, exercise_format)
-        responses.append(response)
-    # Fill missing responses (if any) up to 5 outputs.
-    all_responses = responses + [""] * (5 - len(responses))
+
+    # 3) Run the multiple samples in parallel
+    # Create a short helper that does only the "diagnose" steps:
+    tasks = [
+        chain_instance.diagnose_only(standardized_exercise)
+        for _ in range(num_samples)
+    ]
+    # run concurrently
+    responses = await asyncio.gather(*tasks)
+
+    # pad up to 5 if needed
+    all_responses = list(responses) + [""] * (5 - len(responses))
+
     # Return a tuple of exactly 5 responses.
     return tuple(all_responses)
 
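The refactor above boils down to "standardize once, then run each diagnosis pass concurrently". A minimal sketch of that pattern with stand-in async functions (the real standardize_exercise and DiagnoserChain.diagnose_only live in config/ and chains/; the names below are placeholders):

```python
import asyncio

async def standardize(query: str) -> str:
    # Stand-in for standardize_exercise: a single LLM call that normalizes formatting.
    return query.strip()

async def diagnose(standardized: str, run_idx: int) -> str:
    # Stand-in for DiagnoserChain.diagnose_only: one independent diagnosis pass.
    return f"diagnosis {run_idx}: {standardized}"

async def run_samples(query: str, num_samples: int, max_outputs: int = 5) -> tuple:
    standardized = await standardize(query)                        # done exactly once
    tasks = [diagnose(standardized, i) for i in range(num_samples)]
    responses = await asyncio.gather(*tasks)                        # run concurrently
    padded = list(responses) + [""] * (max_outputs - len(responses))
    return tuple(padded[:max_outputs])                              # always a fixed-size tuple

print(asyncio.run(run_samples("Which option is correct?", 3)))
```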
chains/diagnoser_chain.py CHANGED
@@ -7,49 +7,45 @@ from config.exercise_standardizer import standardize_exercise
 
 
 class DiagnoserChain(BaseModel):
-    template_standardize: ChatPromptTemplate
-    llm_standardize: Any  # Fixed LLM for step 1
     templates_diagnose: List[ChatPromptTemplate]
-    llm_diagnose: Any  # User-selectable LLM for step 2
     template_diagnose_scorecard: ChatPromptTemplate
+    llm_diagnose: Any
+    llm_4o_mini: Any
 
-    async def run(self, user_query: str, exercise_format: str) -> str:
+    async def diagnose_only(self, standardized_exercise: str) -> str:
         """
-        Runs the composite chain:
-        1. Standardizes the exercise formatting
-        2. Feeds the standardized exercise into multiple diagnosis prompts in parallel
-        3. Combines the outputs from each prompt.
-        4. Generates one-line scorecard of combined diagnoses
-
+        Takes a PRE-standardized exercise and:
+        (1) Runs multiple diagnosis prompts in parallel,
+        (2) Merges the results,
+        (3) Generates a scorecard line,
+        (4) Returns the combined text + scorecard.
         """
-        # Step 1: Standardize the exercise.
-        standardized_exercise = await standardize_exercise(
-            user_query, exercise_format, self.template_standardize, self.llm_standardize
-        )
 
-        # Step 2: Define an async helper to run a single diagnosis prompt.
+        # Step 1: define an async helper to run each diagnosis in parallel
         async def run_single_diagnosis(template: ChatPromptTemplate, idx: int) -> str:
             prompt = await template.aformat_prompt(standardized_exercise=standardized_exercise)
             messages = prompt.to_messages()
             diagnosis_response = await self.llm_diagnose.ainvoke(messages)
-            content = diagnosis_response.content if hasattr(diagnosis_response, "content") else diagnosis_response
-            return f"--- [DIAGNOSIS {idx}] --- \n{content}"
+            content = getattr(diagnosis_response, "content", diagnosis_response)
+            return f"--- [DIAGNOSIS {idx}] ---\n{content}"
 
-        # Launch all diagnosis tasks concurrently.
+        # Step 2: launch all diagnoses concurrently
         tasks = [
             run_single_diagnosis(template, idx)
             for idx, template in enumerate(self.templates_diagnose, start=1)
         ]
         diagnoses = await asyncio.gather(*tasks)
 
-        # Step 3: Combine the outputs from each prompt.
-        combined_diagnosis = "\n\n".join(diagnoses)
+        # Step 3: combine the outputs
+        combined_diagnosis = "\n".join(diagnoses)
 
-        # Step 4: Generate scorecard
-        prompt = await self.template_diagnose_scorecard.aformat_prompt(combined_diagnosis=combined_diagnosis)
+        # Step 4: Generate a one-line scorecard
+        prompt = await self.template_diagnose_scorecard.aformat_prompt(
+            combined_diagnosis=combined_diagnosis
+        )
         scorecard_messages = prompt.to_messages()
-        scorecard_response = await self.llm_diagnose.ainvoke(scorecard_messages)
-        scorecard = scorecard_response.content if hasattr(scorecard_response, "content") else scorecard_response
+        scorecard_response = await self.llm_4o_mini.ainvoke(scorecard_messages)
+        scorecard = getattr(scorecard_response, "content", scorecard_response)
 
         return combined_diagnosis + "\n\n" + scorecard
 
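One small change worth noting: getattr(response, "content", response) replaces the hasattr/ternary form. Both accept either a message object with a .content attribute or a bare string; a toy illustration (the class below is made up, not LangChain's):

```python
class FakeAIMessage:
    def __init__(self, content: str):
        self.content = content

def extract_text(response) -> str:
    # Returns .content when it exists, otherwise the value itself.
    return getattr(response, "content", response)

assert extract_text(FakeAIMessage("looks fine")) == "looks fine"
assert extract_text("already a string") == "already a string"
```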
config/chain_configs.py CHANGED
@@ -1,8 +1,14 @@
 # config/chain_configs.py
-from config.templates import standardize_template, diagnose_template, distractors_template, \
-    template_diagnose_double_negation, template_diagnose_correct_answer_stands_out, \
-    template_diagnose_distractor_clearly_wrong, template_diagnose_distractor_partially_correct, \
+from config.templates import (
+    standardize_template,
+    diagnose_template,
+    distractors_template,
+    template_diagnose_double_negation,
+    template_diagnose_correct_answer_stands_out,
+    template_diagnose_distractor_clearly_wrong,
+    template_diagnose_distractor_partially_correct,
     diagnose_scorecard_template
+)
 from chains.diagnoser_chain import DiagnoserChain
 from chains.distractors_chain import DistractorsChain
 from config.llm_config import llms
@@ -11,8 +17,7 @@ from config.llm_config import llms
 chain_configs = {
     "diagnoser": {
         "class": DiagnoserChain,
-        "template_standardize": standardize_template,
-        "llm_standardize": llms["GPT-4o-mini"], # Always fixed
+        "llm_4o_mini": llms["GPT-4o-mini"],
         # 4 different diagnosis templates (to run in parallel):
         "templates_diagnose": [
            template_diagnose_double_negation,
@@ -26,7 +31,7 @@ chain_configs = {
     "distractors": {
         "class": DistractorsChain,
         "template_standardize": standardize_template,
-        "llm_standardize": llms["GPT-4o-mini"], # Always fixed
+        "llm_standardize": llms["GPT-4o-mini-zero"], # Always fixed
         "template_distractors": distractors_template,
         "llm_distractors": llms["GPT-4o"], # Default; can be replaced in UI
     },
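Each config entry keeps a "class" key plus constructor keyword arguments, so a chain can be built generically from its config dict. A sketch of that dispatch, assuming every non-"class" key is a valid constructor argument (a hypothetical helper, not part of this commit):

```python
def build_chain(config: dict, **overrides):
    # Everything except "class" is passed through as constructor kwargs;
    # callers can still override individual entries (e.g. llm_diagnose).
    kwargs = {k: v for k, v in config.items() if k != "class"}
    kwargs.update(overrides)
    return config["class"](**kwargs)
```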
config/llm_config.py CHANGED
@@ -30,7 +30,8 @@ def create_deepseek_llm(model_name: str, temperature: float):
 
 llms = {
     "GPT-4o": create_openai_llm("gpt-4o", LOW),
-    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini-zero": create_openai_llm("gpt-4o-mini", ZERO),
+    "GPT-4o-mini": create_openai_llm("gpt-4o-mini", LOW),
     "GPT-4o_high_temp": create_openai_llm("gpt-4o", HIGH),
     "GPT-4o-mini_high_temp": create_openai_llm("gpt-4o-mini", HIGH),
     "GPT-4 Turbo": create_openai_llm("gpt-4-turbo-2024-04-09", HIGH),
config/templates.py CHANGED
@@ -60,7 +60,7 @@ template_diagnose_double_negation = ChatPromptTemplate(
 </double negative explanation>
 </example 2>.
 If it's obvious that there is or isn't a double negative in this exercise, just give a short one-sentence diagnosis on this.
-If you're not quite sure, do some reasoning first, and give your diagnosis then."""),
+If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -114,7 +114,7 @@ template_diagnose_correct_answer_stands_out = ChatPromptTemplate(
 </example where the correct answer is grammatically different>
 
 Your only focus is to accurately diagnose this issue, no need to provide a fix. If the correct answer in the given exercise clearly does or does not stand out, just give a short one-sentence diagnosis on this.
-If you're not quite sure, do some reasoning first, and give your diagnosis then."""),
+If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -134,7 +134,7 @@ template_diagnose_distractor_clearly_wrong = ChatPromptTemplate(
 be plausible but incorrect.
 Identify distractors that are obviously wrong, such that even students that are completely uninformed about the topic can eliminate them.
 Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't obviously incorrect, just give a short one-sentence diagnosis on this.
-If you're not quite sure, do some reasoning first, and give your diagnosis then."""),
+If the issue is more nuanced, do some reasoning first, and give your diagnosis then."""),
         ("human", "{standardized_exercise}")
     ],
     input_variables=["standardized_exercise"]
@@ -146,7 +146,7 @@ template_diagnose_distractor_partially_correct = ChatPromptTemplate(
 partially correct. Some answer choices may contain elements of truth, leading to
 ambiguity. Identify such cases. Really stress-test them: is there a story you could tell where the distractor, in the context of this exercise, could be considered a (partially) correct answer?
 Your only focus is to accurately diagnose this issue, no need to provide a fix. If all distractors in the given exercise clearly are or aren't unambiguously false, just give a short one-sentence diagnosis on this.
-If you're not quite sure, do some reasoning first, and give your diagnosis then.
+If the issue is more nuanced, do some reasoning first, and give your diagnosis then.
 """),
         ("human", "{standardized_exercise}")
     ],
@@ -166,16 +166,16 @@ diagnose_scorecard_template = ChatPromptTemplate(
 (and a third icon if need be: - ❔ means the diagnosis is unclear)
 The scorecard should always look like this:
 <template>
-|The exercise does not contain/contains a double negative: ✅/❌| |The correct answer does not/does stand out: ✅/❌| |None/Some of the distractors are too obviously false: ✅/❌| |None/Some of the distractors are actually also kinda correct: ✅/❌|
+The exercise does not contain/contains a double negative: ✅/❌ -- The correct answer does not/does stand out: ✅/❌ -- None/Some of the distractors are too obviously false: ✅/❌ -- None/Some of the distractors are actually also kinda correct: ✅/❌
 </template>
 <example 1>
-|The exercise doesn't contain a double negative: ✅| |The correct answer does not stand out: ✅| |None of the distractors are too obviously false: ✅| |None of the distractors are actually also kinda correct: ✅|
+The exercise doesn't contain a double negative: ✅ -- The correct answer does not stand out: ✅ -- None of the distractors are too obviously false: ✅ -- None of the distractors are actually also kinda correct: ✅
 </example 1>
 <example 2>
-|The exercise doesn't contain a double negative: ✅| |The correct answer does stand out: ❌| |None of the distractors are too obviously false: ✅| |Some of the distractors are actually also kinda correct: ❌|
+The exercise doesn't contain a double negative: ✅ -- The correct answer does stand out: ❌ -- None of the distractors are too obviously false: ✅ -- Some of the distractors are actually also kinda correct: ❌
 </example 2>
 <example 3>
-|The exercise contains a double negative: ❌| |The correct answer does not stand out: ✅| |Some of the distractors are too obviously false: ❌| |None of the distractors are actually also kinda correct: ✅|
+The exercise contains a double negative: ❌ -- The correct answer does not stand out: ✅ -- Some of the distractors are too obviously false: ❌ -- None of the distractors are actually also kinda correct: ✅
 </example 3>
 """),
     ("human", "{combined_diagnosis}")
test exercises.txt → test exercises.md RENAMED
File without changes