Pulastya B committed on
Commit
7502356
Β·
1 Parent(s): 5ce70d3

Added query awareness and success/failure marking, further improving pipeline accuracy

Browse files
src/orchestrator.py CHANGED
@@ -3240,8 +3240,12 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3240
 
3241
  tool_result = self._execute_tool(tool_name, tool_args)
3242
 
 
 
 
 
3243
  # Track output file for next iteration β€” ONLY update for data files
3244
- if tool_result.get("success", True):
3245
  result_data = tool_result.get("result", {})
3246
  if isinstance(result_data, dict):
3247
  new_file = result_data.get("output_file") or result_data.get("output_path")
@@ -3265,7 +3269,10 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3265
  print(f" βœ“ Tool completed successfully")
3266
  else:
3267
  error_msg = tool_result.get("error", "Unknown error")
 
3268
  print(f" ❌ Tool failed: {error_msg}")
 
 
3269
  if hasattr(self, 'session') and self.session:
3270
  progress_manager.emit(self.session.session_id, {
3271
  'type': 'tool_failed',
@@ -3286,7 +3293,7 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3286
  self._update_workflow_state(tool_name, tool_result)
3287
 
3288
  # Checkpoint
3289
- if tool_result.get("success", True):
3290
  session_id = self.http_session_key or "default"
3291
  self.recovery_manager.checkpoint_manager.save_checkpoint(
3292
  session_id=session_id,
@@ -3333,7 +3340,9 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3333
  tool_name=tool_name,
3334
  arguments=tool_args,
3335
  result_summary=compressed_result,
3336
- evaluation=evaluation
 
 
3337
  )
3338
  findings.add_finding(finding)
3339
 
 
3240
 
3241
  tool_result = self._execute_tool(tool_name, tool_args)
3242
 
3243
+ # Determine success/failure
3244
+ tool_success = tool_result.get("success", True)
3245
+ tool_error = ""
3246
+
3247
  # Track output file for next iteration β€” ONLY update for data files
3248
+ if tool_success:
3249
  result_data = tool_result.get("result", {})
3250
  if isinstance(result_data, dict):
3251
  new_file = result_data.get("output_file") or result_data.get("output_path")
 
3269
  print(f" βœ“ Tool completed successfully")
3270
  else:
3271
  error_msg = tool_result.get("error", "Unknown error")
3272
+ tool_error = str(error_msg)[:300]
3273
  print(f" ❌ Tool failed: {error_msg}")
3274
+ # Record failure so Reasoner won't retry this tool
3275
+ findings.add_failed_tool(tool_name, tool_error)
3276
  if hasattr(self, 'session') and self.session:
3277
  progress_manager.emit(self.session.session_id, {
3278
  'type': 'tool_failed',
 
3293
  self._update_workflow_state(tool_name, tool_result)
3294
 
3295
  # Checkpoint
3296
+ if tool_success:
3297
  session_id = self.http_session_key or "default"
3298
  self.recovery_manager.checkpoint_manager.save_checkpoint(
3299
  session_id=session_id,
 
3340
  tool_name=tool_name,
3341
  arguments=tool_args,
3342
  result_summary=compressed_result,
3343
+ evaluation=evaluation,
3344
+ success=tool_success,
3345
+ error_message=tool_error
3346
  )
3347
  findings.add_finding(finding)
3348
 
src/reasoning/evaluator.py CHANGED
@@ -177,7 +177,9 @@ class Evaluator:
177
  tool_name: str,
178
  arguments: Dict[str, Any],
179
  result_summary: str,
180
- evaluation: "EvaluationOutput"
 
 
181
  ) -> Finding:
182
  """
183
  Build a Finding from a completed iteration.
@@ -192,9 +194,11 @@ class Evaluator:
192
  arguments=arguments,
193
  result_summary=result_summary[:1000], # Cap size
194
  interpretation=evaluation.interpretation,
195
- confidence=evaluation.confidence,
196
- answered_question=evaluation.answered,
197
- next_questions=evaluation.next_questions
 
 
198
  )
199
 
200
  def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
 
177
  tool_name: str,
178
  arguments: Dict[str, Any],
179
  result_summary: str,
180
+ evaluation: "EvaluationOutput",
181
+ success: bool = True,
182
+ error_message: str = ""
183
  ) -> Finding:
184
  """
185
  Build a Finding from a completed iteration.
 
194
  arguments=arguments,
195
  result_summary=result_summary[:1000], # Cap size
196
  interpretation=evaluation.interpretation,
197
+ confidence=evaluation.confidence if success else 0.0,
198
+ answered_question=evaluation.answered if success else False,
199
+ next_questions=evaluation.next_questions,
200
+ success=success,
201
+ error_message=error_message
202
  )
203
 
204
  def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
src/reasoning/findings.py CHANGED
@@ -35,6 +35,8 @@ class Finding:
35
  confidence: float # 0.0-1.0 confidence in this finding
36
  answered_question: bool # Did this iteration answer the user's question?
37
  next_questions: List[str] # Follow-up questions generated
 
 
38
  timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
39
 
40
  def to_dict(self) -> Dict[str, Any]:
@@ -48,6 +50,8 @@ class Finding:
48
  "confidence": self.confidence,
49
  "answered": self.answered_question,
50
  "next_questions": self.next_questions,
 
 
51
  "timestamp": self.timestamp
52
  }
53
 
@@ -121,6 +125,7 @@ class FindingsAccumulator:
121
  self.hypotheses: List[Hypothesis] = []
122
  self.tools_used: List[str] = []
123
  self.files_produced: List[str] = []
 
124
  self.is_answered = False
125
  self.answer_confidence = 0.0
126
  self.started_at = datetime.now().isoformat()
@@ -152,6 +157,23 @@ class FindingsAccumulator:
152
  source_iteration=finding.iteration
153
  ))
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
156
  """Add a hypothesis to test."""
157
  if not any(h.text == text for h in self.hypotheses):
@@ -215,12 +237,18 @@ class FindingsAccumulator:
215
  parts.append(f"**Investigations completed**: {len(self.findings)}")
216
  parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
217
 
 
 
 
 
 
218
  # Recent findings (most relevant for next decision)
219
  recent = self.findings[-max_findings:]
220
  parts.append("\n**Recent findings**:")
221
  for f in recent:
 
222
  parts.append(
223
- f" Step {f.iteration}: Ran `{f.action}` to test: \"{f.hypothesis}\"\n"
224
  f" β†’ Result: {f.interpretation}\n"
225
  f" β†’ Confidence: {f.confidence:.0%}"
226
  )
@@ -257,8 +285,9 @@ class FindingsAccumulator:
257
  # All findings in order
258
  parts.append("\n## Investigation Steps\n")
259
  for f in self.findings:
 
260
  parts.append(
261
- f"### Step {f.iteration}: {f.action}\n"
262
  f"**Hypothesis**: {f.hypothesis}\n"
263
  f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
264
  f"**Result**: {f.result_summary}\n"
 
35
  confidence: float # 0.0-1.0 confidence in this finding
36
  answered_question: bool # Did this iteration answer the user's question?
37
  next_questions: List[str] # Follow-up questions generated
38
+ success: bool = True # Whether the tool execution succeeded
39
+ error_message: str = "" # Error message if tool failed
40
  timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
41
 
42
  def to_dict(self) -> Dict[str, Any]:
 
50
  "confidence": self.confidence,
51
  "answered": self.answered_question,
52
  "next_questions": self.next_questions,
53
+ "success": self.success,
54
+ "error_message": self.error_message,
55
  "timestamp": self.timestamp
56
  }
57
 
 
125
  self.hypotheses: List[Hypothesis] = []
126
  self.tools_used: List[str] = []
127
  self.files_produced: List[str] = []
128
+ self.failed_tools: Dict[str, str] = {} # tool_name β†’ error message
129
  self.is_answered = False
130
  self.answer_confidence = 0.0
131
  self.started_at = datetime.now().isoformat()
 
157
  source_iteration=finding.iteration
158
  ))
159
 
160
def add_failed_tool(self, tool_name: str, error_message: str):
    """Mark *tool_name* as failed so the Reasoner will not retry it.

    If the same tool fails more than once, the most recent error
    message overwrites the earlier one.
    """
    # Registry is keyed by tool name; update() keeps last-error-wins semantics.
    self.failed_tools.update({tool_name: error_message})
163
+
164
def get_failed_tools_context(self) -> str:
    """Render the failed-tool registry as a warning block for prompts.

    Returns an empty string when no tool has failed. Each error message
    is truncated to 150 characters to keep the prompt compact.
    """
    if not self.failed_tools:
        return ""
    lines = ["\n**FAILED TOOLS (do NOT retry these)**:"]
    lines.extend(
        f"  - `{name}`: {err[:150]}" for name, err in self.failed_tools.items()
    )
    return "\n".join(lines)
172
+
173
def get_successful_findings(self) -> List[Finding]:
    """Return only the findings whose tool execution succeeded.

    Order is preserved; failed steps are excluded so downstream
    consumers never cite data produced by a failed tool run.
    """
    return [finding for finding in self.findings if finding.success]
176
+
177
  def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
178
  """Add a hypothesis to test."""
179
  if not any(h.text == text for h in self.hypotheses):
 
237
  parts.append(f"**Investigations completed**: {len(self.findings)}")
238
  parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
239
 
240
+ # Failed tools warning (critical for avoiding retries)
241
+ failed_ctx = self.get_failed_tools_context()
242
+ if failed_ctx:
243
+ parts.append(failed_ctx)
244
+
245
  # Recent findings (most relevant for next decision)
246
  recent = self.findings[-max_findings:]
247
  parts.append("\n**Recent findings**:")
248
  for f in recent:
249
+ status_tag = "" if f.success else " [FAILED]"
250
  parts.append(
251
+ f" Step {f.iteration}: Ran `{f.action}`{status_tag} to test: \"{f.hypothesis}\"\n"
252
  f" β†’ Result: {f.interpretation}\n"
253
  f" β†’ Confidence: {f.confidence:.0%}"
254
  )
 
285
  # All findings in order
286
  parts.append("\n## Investigation Steps\n")
287
  for f in self.findings:
288
+ status_label = "\u2705 SUCCESS" if f.success else "\u274c FAILED"
289
  parts.append(
290
+ f"### Step {f.iteration}: {f.action} [{status_label}]\n"
291
  f"**Hypothesis**: {f.hypothesis}\n"
292
  f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
293
  f"**Result**: {f.result_summary}\n"
src/reasoning/reasoner.py CHANGED
@@ -77,7 +77,18 @@ CRITICAL RULES:
77
  - If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
78
  - NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
79
  - For visualization, pick the chart type that best answers the question
80
- - NEVER hallucinate column names - use only columns from the schema"""
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  REASONER_USER_TEMPLATE = """**User's question**: {question}
83
 
@@ -94,7 +105,9 @@ REASONER_USER_TEMPLATE = """**User's question**: {question}
94
  **Available tools**:
95
  {tools_description}
96
 
97
- IMPORTANT: For ANY tool that needs a file_path argument, use "{file_path}" β€” the original data file. Do NOT use paths to HTML reports, plots, or other output artifacts.
 
 
98
 
99
  Decide the next action. Respond with ONLY this JSON:
100
  {{
@@ -223,8 +236,8 @@ class Reasoner:
223
  max_tokens=1024
224
  )
225
 
226
- # Parse response
227
- return self._parse_response(response_text, file_path)
228
 
229
  def generate_hypotheses(
230
  self,
@@ -273,7 +286,7 @@ class Reasoner:
273
 
274
  return self._parse_hypotheses(response_text)
275
 
276
- def _parse_response(self, response_text: str, file_path: str) -> ReasoningOutput:
277
  """Parse LLM response into ReasoningOutput."""
278
  try:
279
  # Try direct JSON parse
@@ -319,6 +332,14 @@ class Reasoner:
319
  if fp.lower().endswith(non_data_extensions):
320
  arguments["file_path"] = file_path
321
 
 
 
 
 
 
 
 
 
322
  return ReasoningOutput(
323
  status=status,
324
  reasoning=data.get("reasoning", ""),
 
77
  - If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
78
  - NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
79
  - For visualization, pick the chart type that best answers the question
80
+ - NEVER hallucinate column names - use only columns from the schema
81
+
82
+ TOOL FAILURE RULES:
83
+ - NEVER retry a tool that has already FAILED β€” try a DIFFERENT tool or approach instead
84
+ - If the "FAILED TOOLS" section lists a tool, that tool WILL fail again β€” do not call it
85
+ - If multiple tools have failed, consider stopping and synthesizing what you have
86
+
87
+ QUERY TYPE AWARENESS:
88
+ - For questions about "important features", "feature importance", "correlations", "patterns", or "explain the data":
89
+ Use EDA tools (profile_dataset, analyze_correlations, auto_feature_selection, generate_eda_plots)
90
+ Do NOT use model training tools (train_with_autogluon, train_model, etc.) β€” training is unnecessary for feature explanation
91
+ - Only use model training tools when the user explicitly asks to train, predict, build a model, or classify/regress"""
92
 
93
  REASONER_USER_TEMPLATE = """**User's question**: {question}
94
 
 
105
  **Available tools**:
106
  {tools_description}
107
 
108
+ IMPORTANT:
109
+ - For ANY tool that needs a file_path argument, use "{file_path}" β€” the original data file. Do NOT use paths to HTML reports, plots, or other output artifacts.
110
+ - If a tool is listed under FAILED TOOLS above, do NOT call it again β€” it will fail. Choose a different tool or stop.
111
 
112
  Decide the next action. Respond with ONLY this JSON:
113
  {{
 
236
  max_tokens=1024
237
  )
238
 
239
+ # Parse response (pass findings so we can reject failed tools)
240
+ return self._parse_response(response_text, file_path, findings)
241
 
242
  def generate_hypotheses(
243
  self,
 
286
 
287
  return self._parse_hypotheses(response_text)
288
 
289
+ def _parse_response(self, response_text: str, file_path: str, findings: Optional[FindingsAccumulator] = None) -> ReasoningOutput:
290
  """Parse LLM response into ReasoningOutput."""
291
  try:
292
  # Try direct JSON parse
 
332
  if fp.lower().endswith(non_data_extensions):
333
  arguments["file_path"] = file_path
334
 
335
+ # πŸ›‘οΈ SAFETY: Reject tools that already failed β€” force "done" to stop wasting iterations
336
+ if tool_name and findings and tool_name in findings.failed_tools:
337
+ print(f" ⚠️ Reasoner picked failed tool '{tool_name}' β€” forcing done")
338
+ return ReasoningOutput.done(
339
+ reasoning=f"Tool '{tool_name}' previously failed. Stopping to synthesize available findings.",
340
+ confidence=max(0.3, findings.answer_confidence)
341
+ )
342
+
343
  return ReasoningOutput(
344
  status=status,
345
  reasoning=data.get("reasoning", ""),
src/reasoning/synthesizer.py CHANGED
@@ -39,7 +39,11 @@ RULES:
39
  - Mention generated files/plots so user can find them
40
  - Be honest about confidence levels
41
  - Keep it under 500 words unless complex analysis warrants more
42
- - Use markdown formatting (headers, bullets, bold for emphasis)"""
 
 
 
 
43
 
44
  SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
45
 
@@ -93,9 +97,12 @@ class Synthesizer:
93
  # Build artifacts summary
94
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
95
 
 
 
 
96
  user_prompt = SYNTHESIS_USER_TEMPLATE.format(
97
  question=findings.question,
98
- findings_context=findings.get_context_for_synthesis(),
99
  artifacts_summary=artifacts_summary
100
  )
101
 
@@ -136,14 +143,20 @@ RULES:
136
  - Use specific numbers and metrics
137
  - Mention all generated visualizations with file paths
138
  - Suggest actionable next analysis steps
139
- - Keep it engaging but data-driven"""
 
 
 
140
 
141
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
142
 
 
 
 
143
  user_prompt = f"""**Analysis request**: {findings.question}
144
 
145
  **Investigation summary**:
146
- {findings.get_context_for_synthesis()}
147
 
148
  **Generated artifacts**:
149
  {artifacts_summary}
@@ -179,8 +192,10 @@ Write the exploratory analysis report."""
179
  for f in files:
180
  parts.append(f" - {f}")
181
 
182
- # Extract from findings history
183
  for finding in findings.findings:
 
 
184
  result = finding.result_summary
185
  if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
186
  parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
@@ -193,3 +208,59 @@ Write the exploratory analysis report."""
193
  return "No artifacts generated yet."
194
 
195
  return "\n".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  - Mention generated files/plots so user can find them
40
  - Be honest about confidence levels
41
  - Keep it under 500 words unless complex analysis warrants more
42
+ - Use markdown formatting (headers, bullets, bold for emphasis)
43
+ - ONLY report findings from SUCCESSFUL investigation steps
44
+ - Do NOT invent numbers, statistics, or insights that are not present in the findings
45
+ - If a step is marked [FAILED], ignore its results entirely β€” do not fabricate data from it
46
+ - If most steps failed, be transparent about limited evidence and recommend re-running"""
47
 
48
  SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
49
 
 
97
  # Build artifacts summary
98
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
99
 
100
+ # Build findings context β€” only successful findings get full detail
101
+ findings_context = self._build_filtered_context(findings)
102
+
103
  user_prompt = SYNTHESIS_USER_TEMPLATE.format(
104
  question=findings.question,
105
+ findings_context=findings_context,
106
  artifacts_summary=artifacts_summary
107
  )
108
 
 
143
  - Use specific numbers and metrics
144
  - Mention all generated visualizations with file paths
145
  - Suggest actionable next analysis steps
146
+ - Keep it engaging but data-driven
147
+ - ONLY report findings from SUCCESSFUL investigation steps
148
+ - Do NOT invent numbers or statistics not present in the findings
149
+ - If a step is marked [FAILED], ignore it entirely"""
150
 
151
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
152
 
153
+ # Build filtered context β€” only successful findings
154
+ findings_context = self._build_filtered_context(findings)
155
+
156
  user_prompt = f"""**Analysis request**: {findings.question}
157
 
158
  **Investigation summary**:
159
+ {findings_context}
160
 
161
  **Generated artifacts**:
162
  {artifacts_summary}
 
192
  for f in files:
193
  parts.append(f" - {f}")
194
 
195
+ # Extract from findings history β€” only from successful steps
196
  for finding in findings.findings:
197
+ if not finding.success:
198
+ continue
199
  result = finding.result_summary
200
  if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
201
  parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
 
208
  return "No artifacts generated yet."
209
 
210
  return "\n".join(parts)
211
+
212
+ def _build_filtered_context(self, findings: FindingsAccumulator) -> str:
213
+ """
214
+ Build synthesis context that only includes SUCCESSFUL findings in detail.
215
+ Failed findings are listed as a brief summary so the LLM knows they happened
216
+ but cannot hallucinate data from them.
217
+ """
218
+ import json
219
+
220
+ parts = []
221
+ parts.append(f"**Original question**: {findings.question}")
222
+ parts.append(f"**Mode**: {findings.mode}")
223
+
224
+ successful = findings.get_successful_findings()
225
+ failed = [f for f in findings.findings if not f.success]
226
+
227
+ parts.append(f"**Total iterations**: {len(findings.findings)} ({len(successful)} succeeded, {len(failed)} failed)")
228
+ parts.append(f"**Tools used**: {', '.join(findings.tools_used)}")
229
+
230
+ # Only successful findings get full detail
231
+ if successful:
232
+ parts.append("\n## Successful Investigation Steps\n")
233
+ for f in successful:
234
+ parts.append(
235
+ f"### Step {f.iteration}: {f.action}\n"
236
+ f"**Hypothesis**: {f.hypothesis}\n"
237
+ f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
238
+ f"**Result**: {f.result_summary}\n"
239
+ f"**Interpretation**: {f.interpretation}\n"
240
+ f"**Confidence**: {f.confidence:.0%}\n"
241
+ )
242
+
243
+ # Failed findings get just a one-line mention
244
+ if failed:
245
+ parts.append("\n## Failed Steps (no usable data β€” do NOT cite these)\n")
246
+ for f in failed:
247
+ parts.append(f"- Step {f.iteration}: `{f.action}` FAILED β€” {f.error_message or 'execution error'}")
248
+
249
+ # Hypothesis outcomes
250
+ if findings.hypotheses:
251
+ parts.append("\n## Hypothesis Outcomes\n")
252
+ for h in findings.hypotheses:
253
+ status_emoji = {
254
+ "supported": "\u2705",
255
+ "refuted": "\u274c",
256
+ "inconclusive": "\u2753",
257
+ "testing": "\ud83d\udd04",
258
+ "untested": "\u2b1c"
259
+ }.get(h.status, "\u2b1c")
260
+ parts.append(f"{status_emoji} **{h.text}** \u2192 {h.status}")
261
+ if h.evidence_for:
262
+ parts.append(f" Evidence for: {'; '.join(h.evidence_for)}")
263
+ if h.evidence_against:
264
+ parts.append(f" Evidence against: {'; '.join(h.evidence_against)}")
265
+
266
+ return "\n".join(parts)