Spaces:
Running
Running
Pulastya B commited on
Commit Β·
7502356
1
Parent(s): 5ce70d3
Added Query Awareness , Success/Failure Marking further improving pipeline accuracy
Browse files- src/orchestrator.py +12 -3
- src/reasoning/evaluator.py +8 -4
- src/reasoning/findings.py +31 -2
- src/reasoning/reasoner.py +26 -5
- src/reasoning/synthesizer.py +76 -5
src/orchestrator.py
CHANGED
|
@@ -3240,8 +3240,12 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 3240 |
|
| 3241 |
tool_result = self._execute_tool(tool_name, tool_args)
|
| 3242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3243 |
# Track output file for next iteration β ONLY update for data files
|
| 3244 |
-
if
|
| 3245 |
result_data = tool_result.get("result", {})
|
| 3246 |
if isinstance(result_data, dict):
|
| 3247 |
new_file = result_data.get("output_file") or result_data.get("output_path")
|
|
@@ -3265,7 +3269,10 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 3265 |
print(f" β Tool completed successfully")
|
| 3266 |
else:
|
| 3267 |
error_msg = tool_result.get("error", "Unknown error")
|
|
|
|
| 3268 |
print(f" β Tool failed: {error_msg}")
|
|
|
|
|
|
|
| 3269 |
if hasattr(self, 'session') and self.session:
|
| 3270 |
progress_manager.emit(self.session.session_id, {
|
| 3271 |
'type': 'tool_failed',
|
|
@@ -3286,7 +3293,7 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 3286 |
self._update_workflow_state(tool_name, tool_result)
|
| 3287 |
|
| 3288 |
# Checkpoint
|
| 3289 |
-
if
|
| 3290 |
session_id = self.http_session_key or "default"
|
| 3291 |
self.recovery_manager.checkpoint_manager.save_checkpoint(
|
| 3292 |
session_id=session_id,
|
|
@@ -3333,7 +3340,9 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
|
|
| 3333 |
tool_name=tool_name,
|
| 3334 |
arguments=tool_args,
|
| 3335 |
result_summary=compressed_result,
|
| 3336 |
-
evaluation=evaluation
|
|
|
|
|
|
|
| 3337 |
)
|
| 3338 |
findings.add_finding(finding)
|
| 3339 |
|
|
|
|
| 3240 |
|
| 3241 |
tool_result = self._execute_tool(tool_name, tool_args)
|
| 3242 |
|
| 3243 |
+
# Determine success/failure
|
| 3244 |
+
tool_success = tool_result.get("success", True)
|
| 3245 |
+
tool_error = ""
|
| 3246 |
+
|
| 3247 |
# Track output file for next iteration β ONLY update for data files
|
| 3248 |
+
if tool_success:
|
| 3249 |
result_data = tool_result.get("result", {})
|
| 3250 |
if isinstance(result_data, dict):
|
| 3251 |
new_file = result_data.get("output_file") or result_data.get("output_path")
|
|
|
|
| 3269 |
print(f" β Tool completed successfully")
|
| 3270 |
else:
|
| 3271 |
error_msg = tool_result.get("error", "Unknown error")
|
| 3272 |
+
tool_error = str(error_msg)[:300]
|
| 3273 |
print(f" β Tool failed: {error_msg}")
|
| 3274 |
+
# Record failure so Reasoner won't retry this tool
|
| 3275 |
+
findings.add_failed_tool(tool_name, tool_error)
|
| 3276 |
if hasattr(self, 'session') and self.session:
|
| 3277 |
progress_manager.emit(self.session.session_id, {
|
| 3278 |
'type': 'tool_failed',
|
|
|
|
| 3293 |
self._update_workflow_state(tool_name, tool_result)
|
| 3294 |
|
| 3295 |
# Checkpoint
|
| 3296 |
+
if tool_success:
|
| 3297 |
session_id = self.http_session_key or "default"
|
| 3298 |
self.recovery_manager.checkpoint_manager.save_checkpoint(
|
| 3299 |
session_id=session_id,
|
|
|
|
| 3340 |
tool_name=tool_name,
|
| 3341 |
arguments=tool_args,
|
| 3342 |
result_summary=compressed_result,
|
| 3343 |
+
evaluation=evaluation,
|
| 3344 |
+
success=tool_success,
|
| 3345 |
+
error_message=tool_error
|
| 3346 |
)
|
| 3347 |
findings.add_finding(finding)
|
| 3348 |
|
src/reasoning/evaluator.py
CHANGED
|
@@ -177,7 +177,9 @@ class Evaluator:
|
|
| 177 |
tool_name: str,
|
| 178 |
arguments: Dict[str, Any],
|
| 179 |
result_summary: str,
|
| 180 |
-
evaluation: "EvaluationOutput"
|
|
|
|
|
|
|
| 181 |
) -> Finding:
|
| 182 |
"""
|
| 183 |
Build a Finding from a completed iteration.
|
|
@@ -192,9 +194,11 @@ class Evaluator:
|
|
| 192 |
arguments=arguments,
|
| 193 |
result_summary=result_summary[:1000], # Cap size
|
| 194 |
interpretation=evaluation.interpretation,
|
| 195 |
-
confidence=evaluation.confidence,
|
| 196 |
-
answered_question=evaluation.answered,
|
| 197 |
-
next_questions=evaluation.next_questions
|
|
|
|
|
|
|
| 198 |
)
|
| 199 |
|
| 200 |
def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
|
|
|
|
| 177 |
tool_name: str,
|
| 178 |
arguments: Dict[str, Any],
|
| 179 |
result_summary: str,
|
| 180 |
+
evaluation: "EvaluationOutput",
|
| 181 |
+
success: bool = True,
|
| 182 |
+
error_message: str = ""
|
| 183 |
) -> Finding:
|
| 184 |
"""
|
| 185 |
Build a Finding from a completed iteration.
|
|
|
|
| 194 |
arguments=arguments,
|
| 195 |
result_summary=result_summary[:1000], # Cap size
|
| 196 |
interpretation=evaluation.interpretation,
|
| 197 |
+
confidence=evaluation.confidence if success else 0.0,
|
| 198 |
+
answered_question=evaluation.answered if success else False,
|
| 199 |
+
next_questions=evaluation.next_questions,
|
| 200 |
+
success=success,
|
| 201 |
+
error_message=error_message
|
| 202 |
)
|
| 203 |
|
| 204 |
def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
|
src/reasoning/findings.py
CHANGED
|
@@ -35,6 +35,8 @@ class Finding:
|
|
| 35 |
confidence: float # 0.0-1.0 confidence in this finding
|
| 36 |
answered_question: bool # Did this iteration answer the user's question?
|
| 37 |
next_questions: List[str] # Follow-up questions generated
|
|
|
|
|
|
|
| 38 |
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
|
| 39 |
|
| 40 |
def to_dict(self) -> Dict[str, Any]:
|
|
@@ -48,6 +50,8 @@ class Finding:
|
|
| 48 |
"confidence": self.confidence,
|
| 49 |
"answered": self.answered_question,
|
| 50 |
"next_questions": self.next_questions,
|
|
|
|
|
|
|
| 51 |
"timestamp": self.timestamp
|
| 52 |
}
|
| 53 |
|
|
@@ -121,6 +125,7 @@ class FindingsAccumulator:
|
|
| 121 |
self.hypotheses: List[Hypothesis] = []
|
| 122 |
self.tools_used: List[str] = []
|
| 123 |
self.files_produced: List[str] = []
|
|
|
|
| 124 |
self.is_answered = False
|
| 125 |
self.answer_confidence = 0.0
|
| 126 |
self.started_at = datetime.now().isoformat()
|
|
@@ -152,6 +157,23 @@ class FindingsAccumulator:
|
|
| 152 |
source_iteration=finding.iteration
|
| 153 |
))
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
|
| 156 |
"""Add a hypothesis to test."""
|
| 157 |
if not any(h.text == text for h in self.hypotheses):
|
|
@@ -215,12 +237,18 @@ class FindingsAccumulator:
|
|
| 215 |
parts.append(f"**Investigations completed**: {len(self.findings)}")
|
| 216 |
parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# Recent findings (most relevant for next decision)
|
| 219 |
recent = self.findings[-max_findings:]
|
| 220 |
parts.append("\n**Recent findings**:")
|
| 221 |
for f in recent:
|
|
|
|
| 222 |
parts.append(
|
| 223 |
-
f" Step {f.iteration}: Ran `{f.action}` to test: \"{f.hypothesis}\"\n"
|
| 224 |
f" β Result: {f.interpretation}\n"
|
| 225 |
f" β Confidence: {f.confidence:.0%}"
|
| 226 |
)
|
|
@@ -257,8 +285,9 @@ class FindingsAccumulator:
|
|
| 257 |
# All findings in order
|
| 258 |
parts.append("\n## Investigation Steps\n")
|
| 259 |
for f in self.findings:
|
|
|
|
| 260 |
parts.append(
|
| 261 |
-
f"### Step {f.iteration}: {f.action}\n"
|
| 262 |
f"**Hypothesis**: {f.hypothesis}\n"
|
| 263 |
f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
|
| 264 |
f"**Result**: {f.result_summary}\n"
|
|
|
|
| 35 |
confidence: float # 0.0-1.0 confidence in this finding
|
| 36 |
answered_question: bool # Did this iteration answer the user's question?
|
| 37 |
next_questions: List[str] # Follow-up questions generated
|
| 38 |
+
success: bool = True # Whether the tool execution succeeded
|
| 39 |
+
error_message: str = "" # Error message if tool failed
|
| 40 |
timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
|
| 41 |
|
| 42 |
def to_dict(self) -> Dict[str, Any]:
|
|
|
|
| 50 |
"confidence": self.confidence,
|
| 51 |
"answered": self.answered_question,
|
| 52 |
"next_questions": self.next_questions,
|
| 53 |
+
"success": self.success,
|
| 54 |
+
"error_message": self.error_message,
|
| 55 |
"timestamp": self.timestamp
|
| 56 |
}
|
| 57 |
|
|
|
|
| 125 |
self.hypotheses: List[Hypothesis] = []
|
| 126 |
self.tools_used: List[str] = []
|
| 127 |
self.files_produced: List[str] = []
|
| 128 |
+
self.failed_tools: Dict[str, str] = {} # tool_name β error message
|
| 129 |
self.is_answered = False
|
| 130 |
self.answer_confidence = 0.0
|
| 131 |
self.started_at = datetime.now().isoformat()
|
|
|
|
| 157 |
source_iteration=finding.iteration
|
| 158 |
))
|
| 159 |
|
| 160 |
+
def add_failed_tool(self, tool_name: str, error_message: str):
|
| 161 |
+
"""Record a tool that failed so the Reasoner avoids retrying it."""
|
| 162 |
+
self.failed_tools[tool_name] = error_message
|
| 163 |
+
|
| 164 |
+
def get_failed_tools_context(self) -> str:
|
| 165 |
+
"""Build context string listing tools that failed."""
|
| 166 |
+
if not self.failed_tools:
|
| 167 |
+
return ""
|
| 168 |
+
parts = ["\n**FAILED TOOLS (do NOT retry these)**:"]
|
| 169 |
+
for tool, error in self.failed_tools.items():
|
| 170 |
+
parts.append(f" - `{tool}`: {error[:150]}")
|
| 171 |
+
return "\n".join(parts)
|
| 172 |
+
|
| 173 |
+
def get_successful_findings(self) -> List[Finding]:
|
| 174 |
+
"""Return only findings from successful tool executions."""
|
| 175 |
+
return [f for f in self.findings if f.success]
|
| 176 |
+
|
| 177 |
def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
|
| 178 |
"""Add a hypothesis to test."""
|
| 179 |
if not any(h.text == text for h in self.hypotheses):
|
|
|
|
| 237 |
parts.append(f"**Investigations completed**: {len(self.findings)}")
|
| 238 |
parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
|
| 239 |
|
| 240 |
+
# Failed tools warning (critical for avoiding retries)
|
| 241 |
+
failed_ctx = self.get_failed_tools_context()
|
| 242 |
+
if failed_ctx:
|
| 243 |
+
parts.append(failed_ctx)
|
| 244 |
+
|
| 245 |
# Recent findings (most relevant for next decision)
|
| 246 |
recent = self.findings[-max_findings:]
|
| 247 |
parts.append("\n**Recent findings**:")
|
| 248 |
for f in recent:
|
| 249 |
+
status_tag = "" if f.success else " [FAILED]"
|
| 250 |
parts.append(
|
| 251 |
+
f" Step {f.iteration}: Ran `{f.action}`{status_tag} to test: \"{f.hypothesis}\"\n"
|
| 252 |
f" β Result: {f.interpretation}\n"
|
| 253 |
f" β Confidence: {f.confidence:.0%}"
|
| 254 |
)
|
|
|
|
| 285 |
# All findings in order
|
| 286 |
parts.append("\n## Investigation Steps\n")
|
| 287 |
for f in self.findings:
|
| 288 |
+
status_label = "\u2705 SUCCESS" if f.success else "\u274c FAILED"
|
| 289 |
parts.append(
|
| 290 |
+
f"### Step {f.iteration}: {f.action} [{status_label}]\n"
|
| 291 |
f"**Hypothesis**: {f.hypothesis}\n"
|
| 292 |
f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
|
| 293 |
f"**Result**: {f.result_summary}\n"
|
src/reasoning/reasoner.py
CHANGED
|
@@ -77,7 +77,18 @@ CRITICAL RULES:
|
|
| 77 |
- If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
|
| 78 |
- NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
|
| 79 |
- For visualization, pick the chart type that best answers the question
|
| 80 |
-
- NEVER hallucinate column names - use only columns from the schema
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
REASONER_USER_TEMPLATE = """**User's question**: {question}
|
| 83 |
|
|
@@ -94,7 +105,9 @@ REASONER_USER_TEMPLATE = """**User's question**: {question}
|
|
| 94 |
**Available tools**:
|
| 95 |
{tools_description}
|
| 96 |
|
| 97 |
-
IMPORTANT:
|
|
|
|
|
|
|
| 98 |
|
| 99 |
Decide the next action. Respond with ONLY this JSON:
|
| 100 |
{{
|
|
@@ -223,8 +236,8 @@ class Reasoner:
|
|
| 223 |
max_tokens=1024
|
| 224 |
)
|
| 225 |
|
| 226 |
-
# Parse response
|
| 227 |
-
return self._parse_response(response_text, file_path)
|
| 228 |
|
| 229 |
def generate_hypotheses(
|
| 230 |
self,
|
|
@@ -273,7 +286,7 @@ class Reasoner:
|
|
| 273 |
|
| 274 |
return self._parse_hypotheses(response_text)
|
| 275 |
|
| 276 |
-
def _parse_response(self, response_text: str, file_path: str) -> ReasoningOutput:
|
| 277 |
"""Parse LLM response into ReasoningOutput."""
|
| 278 |
try:
|
| 279 |
# Try direct JSON parse
|
|
@@ -319,6 +332,14 @@ class Reasoner:
|
|
| 319 |
if fp.lower().endswith(non_data_extensions):
|
| 320 |
arguments["file_path"] = file_path
|
| 321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
return ReasoningOutput(
|
| 323 |
status=status,
|
| 324 |
reasoning=data.get("reasoning", ""),
|
|
|
|
| 77 |
- If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
|
| 78 |
- NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
|
| 79 |
- For visualization, pick the chart type that best answers the question
|
| 80 |
+
- NEVER hallucinate column names - use only columns from the schema
|
| 81 |
+
|
| 82 |
+
TOOL FAILURE RULES:
|
| 83 |
+
- NEVER retry a tool that has already FAILED β try a DIFFERENT tool or approach instead
|
| 84 |
+
- If the "FAILED TOOLS" section lists a tool, that tool WILL fail again β do not call it
|
| 85 |
+
- If multiple tools have failed, consider stopping and synthesizing what you have
|
| 86 |
+
|
| 87 |
+
QUERY TYPE AWARENESS:
|
| 88 |
+
- For questions about "important features", "feature importance", "correlations", "patterns", or "explain the data":
|
| 89 |
+
Use EDA tools (profile_dataset, analyze_correlations, auto_feature_selection, generate_eda_plots)
|
| 90 |
+
Do NOT use model training tools (train_with_autogluon, train_model, etc.) β training is unnecessary for feature explanation
|
| 91 |
+
- Only use model training tools when the user explicitly asks to train, predict, build a model, or classify/regress"""
|
| 92 |
|
| 93 |
REASONER_USER_TEMPLATE = """**User's question**: {question}
|
| 94 |
|
|
|
|
| 105 |
**Available tools**:
|
| 106 |
{tools_description}
|
| 107 |
|
| 108 |
+
IMPORTANT:
|
| 109 |
+
- For ANY tool that needs a file_path argument, use "{file_path}" β the original data file. Do NOT use paths to HTML reports, plots, or other output artifacts.
|
| 110 |
+
- If a tool is listed under FAILED TOOLS above, do NOT call it again β it will fail. Choose a different tool or stop.
|
| 111 |
|
| 112 |
Decide the next action. Respond with ONLY this JSON:
|
| 113 |
{{
|
|
|
|
| 236 |
max_tokens=1024
|
| 237 |
)
|
| 238 |
|
| 239 |
+
# Parse response (pass findings so we can reject failed tools)
|
| 240 |
+
return self._parse_response(response_text, file_path, findings)
|
| 241 |
|
| 242 |
def generate_hypotheses(
|
| 243 |
self,
|
|
|
|
| 286 |
|
| 287 |
return self._parse_hypotheses(response_text)
|
| 288 |
|
| 289 |
+
def _parse_response(self, response_text: str, file_path: str, findings: Optional[FindingsAccumulator] = None) -> ReasoningOutput:
|
| 290 |
"""Parse LLM response into ReasoningOutput."""
|
| 291 |
try:
|
| 292 |
# Try direct JSON parse
|
|
|
|
| 332 |
if fp.lower().endswith(non_data_extensions):
|
| 333 |
arguments["file_path"] = file_path
|
| 334 |
|
| 335 |
+
# π‘οΈ SAFETY: Reject tools that already failed β force "done" to stop wasting iterations
|
| 336 |
+
if tool_name and findings and tool_name in findings.failed_tools:
|
| 337 |
+
print(f" β οΈ Reasoner picked failed tool '{tool_name}' β forcing done")
|
| 338 |
+
return ReasoningOutput.done(
|
| 339 |
+
reasoning=f"Tool '{tool_name}' previously failed. Stopping to synthesize available findings.",
|
| 340 |
+
confidence=max(0.3, findings.answer_confidence)
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
return ReasoningOutput(
|
| 344 |
status=status,
|
| 345 |
reasoning=data.get("reasoning", ""),
|
src/reasoning/synthesizer.py
CHANGED
|
@@ -39,7 +39,11 @@ RULES:
|
|
| 39 |
- Mention generated files/plots so user can find them
|
| 40 |
- Be honest about confidence levels
|
| 41 |
- Keep it under 500 words unless complex analysis warrants more
|
| 42 |
-
- Use markdown formatting (headers, bullets, bold for emphasis)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
|
| 45 |
|
|
@@ -93,9 +97,12 @@ class Synthesizer:
|
|
| 93 |
# Build artifacts summary
|
| 94 |
artifacts_summary = self._format_artifacts(artifacts or {}, findings)
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
user_prompt = SYNTHESIS_USER_TEMPLATE.format(
|
| 97 |
question=findings.question,
|
| 98 |
-
findings_context=
|
| 99 |
artifacts_summary=artifacts_summary
|
| 100 |
)
|
| 101 |
|
|
@@ -136,14 +143,20 @@ RULES:
|
|
| 136 |
- Use specific numbers and metrics
|
| 137 |
- Mention all generated visualizations with file paths
|
| 138 |
- Suggest actionable next analysis steps
|
| 139 |
-
- Keep it engaging but data-driven
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
artifacts_summary = self._format_artifacts(artifacts or {}, findings)
|
| 142 |
|
|
|
|
|
|
|
|
|
|
| 143 |
user_prompt = f"""**Analysis request**: {findings.question}
|
| 144 |
|
| 145 |
**Investigation summary**:
|
| 146 |
-
{
|
| 147 |
|
| 148 |
**Generated artifacts**:
|
| 149 |
{artifacts_summary}
|
|
@@ -179,8 +192,10 @@ Write the exploratory analysis report."""
|
|
| 179 |
for f in files:
|
| 180 |
parts.append(f" - {f}")
|
| 181 |
|
| 182 |
-
# Extract from findings history
|
| 183 |
for finding in findings.findings:
|
|
|
|
|
|
|
| 184 |
result = finding.result_summary
|
| 185 |
if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
|
| 186 |
parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
|
|
@@ -193,3 +208,59 @@ Write the exploratory analysis report."""
|
|
| 193 |
return "No artifacts generated yet."
|
| 194 |
|
| 195 |
return "\n".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
- Mention generated files/plots so user can find them
|
| 40 |
- Be honest about confidence levels
|
| 41 |
- Keep it under 500 words unless complex analysis warrants more
|
| 42 |
+
- Use markdown formatting (headers, bullets, bold for emphasis)
|
| 43 |
+
- ONLY report findings from SUCCESSFUL investigation steps
|
| 44 |
+
- Do NOT invent numbers, statistics, or insights that are not present in the findings
|
| 45 |
+
- If a step is marked [FAILED], ignore its results entirely β do not fabricate data from it
|
| 46 |
+
- If most steps failed, be transparent about limited evidence and recommend re-running"""
|
| 47 |
|
| 48 |
SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
|
| 49 |
|
|
|
|
| 97 |
# Build artifacts summary
|
| 98 |
artifacts_summary = self._format_artifacts(artifacts or {}, findings)
|
| 99 |
|
| 100 |
+
# Build findings context β only successful findings get full detail
|
| 101 |
+
findings_context = self._build_filtered_context(findings)
|
| 102 |
+
|
| 103 |
user_prompt = SYNTHESIS_USER_TEMPLATE.format(
|
| 104 |
question=findings.question,
|
| 105 |
+
findings_context=findings_context,
|
| 106 |
artifacts_summary=artifacts_summary
|
| 107 |
)
|
| 108 |
|
|
|
|
| 143 |
- Use specific numbers and metrics
|
| 144 |
- Mention all generated visualizations with file paths
|
| 145 |
- Suggest actionable next analysis steps
|
| 146 |
+
- Keep it engaging but data-driven
|
| 147 |
+
- ONLY report findings from SUCCESSFUL investigation steps
|
| 148 |
+
- Do NOT invent numbers or statistics not present in the findings
|
| 149 |
+
- If a step is marked [FAILED], ignore it entirely"""
|
| 150 |
|
| 151 |
artifacts_summary = self._format_artifacts(artifacts or {}, findings)
|
| 152 |
|
| 153 |
+
# Build filtered context β only successful findings
|
| 154 |
+
findings_context = self._build_filtered_context(findings)
|
| 155 |
+
|
| 156 |
user_prompt = f"""**Analysis request**: {findings.question}
|
| 157 |
|
| 158 |
**Investigation summary**:
|
| 159 |
+
{findings_context}
|
| 160 |
|
| 161 |
**Generated artifacts**:
|
| 162 |
{artifacts_summary}
|
|
|
|
| 192 |
for f in files:
|
| 193 |
parts.append(f" - {f}")
|
| 194 |
|
| 195 |
+
# Extract from findings history β only from successful steps
|
| 196 |
for finding in findings.findings:
|
| 197 |
+
if not finding.success:
|
| 198 |
+
continue
|
| 199 |
result = finding.result_summary
|
| 200 |
if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
|
| 201 |
parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
|
|
|
|
| 208 |
return "No artifacts generated yet."
|
| 209 |
|
| 210 |
return "\n".join(parts)
|
| 211 |
+
|
| 212 |
+
def _build_filtered_context(self, findings: FindingsAccumulator) -> str:
|
| 213 |
+
"""
|
| 214 |
+
Build synthesis context that only includes SUCCESSFUL findings in detail.
|
| 215 |
+
Failed findings are listed as a brief summary so the LLM knows they happened
|
| 216 |
+
but cannot hallucinate data from them.
|
| 217 |
+
"""
|
| 218 |
+
import json
|
| 219 |
+
|
| 220 |
+
parts = []
|
| 221 |
+
parts.append(f"**Original question**: {findings.question}")
|
| 222 |
+
parts.append(f"**Mode**: {findings.mode}")
|
| 223 |
+
|
| 224 |
+
successful = findings.get_successful_findings()
|
| 225 |
+
failed = [f for f in findings.findings if not f.success]
|
| 226 |
+
|
| 227 |
+
parts.append(f"**Total iterations**: {len(findings.findings)} ({len(successful)} succeeded, {len(failed)} failed)")
|
| 228 |
+
parts.append(f"**Tools used**: {', '.join(findings.tools_used)}")
|
| 229 |
+
|
| 230 |
+
# Only successful findings get full detail
|
| 231 |
+
if successful:
|
| 232 |
+
parts.append("\n## Successful Investigation Steps\n")
|
| 233 |
+
for f in successful:
|
| 234 |
+
parts.append(
|
| 235 |
+
f"### Step {f.iteration}: {f.action}\n"
|
| 236 |
+
f"**Hypothesis**: {f.hypothesis}\n"
|
| 237 |
+
f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
|
| 238 |
+
f"**Result**: {f.result_summary}\n"
|
| 239 |
+
f"**Interpretation**: {f.interpretation}\n"
|
| 240 |
+
f"**Confidence**: {f.confidence:.0%}\n"
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
# Failed findings get just a one-line mention
|
| 244 |
+
if failed:
|
| 245 |
+
parts.append("\n## Failed Steps (no usable data β do NOT cite these)\n")
|
| 246 |
+
for f in failed:
|
| 247 |
+
parts.append(f"- Step {f.iteration}: `{f.action}` FAILED β {f.error_message or 'execution error'}")
|
| 248 |
+
|
| 249 |
+
# Hypothesis outcomes
|
| 250 |
+
if findings.hypotheses:
|
| 251 |
+
parts.append("\n## Hypothesis Outcomes\n")
|
| 252 |
+
for h in findings.hypotheses:
|
| 253 |
+
status_emoji = {
|
| 254 |
+
"supported": "\u2705",
|
| 255 |
+
"refuted": "\u274c",
|
| 256 |
+
"inconclusive": "\u2753",
|
| 257 |
+
"testing": "\ud83d\udd04",
|
| 258 |
+
"untested": "\u2b1c"
|
| 259 |
+
}.get(h.status, "\u2b1c")
|
| 260 |
+
parts.append(f"{status_emoji} **{h.text}** \u2192 {h.status}")
|
| 261 |
+
if h.evidence_for:
|
| 262 |
+
parts.append(f" Evidence for: {'; '.join(h.evidence_for)}")
|
| 263 |
+
if h.evidence_against:
|
| 264 |
+
parts.append(f" Evidence against: {'; '.join(h.evidence_against)}")
|
| 265 |
+
|
| 266 |
+
return "\n".join(parts)
|