Pulastya B committed on
Commit
7502356
Β·
1 Parent(s): 5ce70d3

Added query awareness and success/failure marking, further improving pipeline accuracy

Browse files
src/orchestrator.py CHANGED
@@ -3240,8 +3240,12 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3240
 
3241
  tool_result = self._execute_tool(tool_name, tool_args)
3242
 
 
 
 
 
3243
  # Track output file for next iteration β€” ONLY update for data files
3244
- if tool_result.get("success", True):
3245
  result_data = tool_result.get("result", {})
3246
  if isinstance(result_data, dict):
3247
  new_file = result_data.get("output_file") or result_data.get("output_path")
@@ -3265,7 +3269,10 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3265
  print(f" βœ“ Tool completed successfully")
3266
  else:
3267
  error_msg = tool_result.get("error", "Unknown error")
 
3268
  print(f" ❌ Tool failed: {error_msg}")
 
 
3269
  if hasattr(self, 'session') and self.session:
3270
  progress_manager.emit(self.session.session_id, {
3271
  'type': 'tool_failed',
@@ -3286,7 +3293,7 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3286
  self._update_workflow_state(tool_name, tool_result)
3287
 
3288
  # Checkpoint
3289
- if tool_result.get("success", True):
3290
  session_id = self.http_session_key or "default"
3291
  self.recovery_manager.checkpoint_manager.save_checkpoint(
3292
  session_id=session_id,
@@ -3333,7 +3340,9 @@ You receive quality reports from EDA agent and deliver clean data to modeling ag
3333
  tool_name=tool_name,
3334
  arguments=tool_args,
3335
  result_summary=compressed_result,
3336
- evaluation=evaluation
 
 
3337
  )
3338
  findings.add_finding(finding)
3339
 
 
3240
 
3241
  tool_result = self._execute_tool(tool_name, tool_args)
3242
 
3243
+ # Determine success/failure
3244
+ tool_success = tool_result.get("success", True)
3245
+ tool_error = ""
3246
+
3247
  # Track output file for next iteration β€” ONLY update for data files
3248
+ if tool_success:
3249
  result_data = tool_result.get("result", {})
3250
  if isinstance(result_data, dict):
3251
  new_file = result_data.get("output_file") or result_data.get("output_path")
 
3269
  print(f" βœ“ Tool completed successfully")
3270
  else:
3271
  error_msg = tool_result.get("error", "Unknown error")
3272
+ tool_error = str(error_msg)[:300]
3273
  print(f" ❌ Tool failed: {error_msg}")
3274
+ # Record failure so Reasoner won't retry this tool
3275
+ findings.add_failed_tool(tool_name, tool_error)
3276
  if hasattr(self, 'session') and self.session:
3277
  progress_manager.emit(self.session.session_id, {
3278
  'type': 'tool_failed',
 
3293
  self._update_workflow_state(tool_name, tool_result)
3294
 
3295
  # Checkpoint
3296
+ if tool_success:
3297
  session_id = self.http_session_key or "default"
3298
  self.recovery_manager.checkpoint_manager.save_checkpoint(
3299
  session_id=session_id,
 
3340
  tool_name=tool_name,
3341
  arguments=tool_args,
3342
  result_summary=compressed_result,
3343
+ evaluation=evaluation,
3344
+ success=tool_success,
3345
+ error_message=tool_error
3346
  )
3347
  findings.add_finding(finding)
3348
 
src/reasoning/evaluator.py CHANGED
@@ -177,7 +177,9 @@ class Evaluator:
177
  tool_name: str,
178
  arguments: Dict[str, Any],
179
  result_summary: str,
180
- evaluation: "EvaluationOutput"
 
 
181
  ) -> Finding:
182
  """
183
  Build a Finding from a completed iteration.
@@ -192,9 +194,11 @@ class Evaluator:
192
  arguments=arguments,
193
  result_summary=result_summary[:1000], # Cap size
194
  interpretation=evaluation.interpretation,
195
- confidence=evaluation.confidence,
196
- answered_question=evaluation.answered,
197
- next_questions=evaluation.next_questions
 
 
198
  )
199
 
200
  def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
 
177
  tool_name: str,
178
  arguments: Dict[str, Any],
179
  result_summary: str,
180
+ evaluation: "EvaluationOutput",
181
+ success: bool = True,
182
+ error_message: str = ""
183
  ) -> Finding:
184
  """
185
  Build a Finding from a completed iteration.
 
194
  arguments=arguments,
195
  result_summary=result_summary[:1000], # Cap size
196
  interpretation=evaluation.interpretation,
197
+ confidence=evaluation.confidence if success else 0.0,
198
+ answered_question=evaluation.answered if success else False,
199
+ next_questions=evaluation.next_questions,
200
+ success=success,
201
+ error_message=error_message
202
  )
203
 
204
  def _parse_response(self, response_text: str, result_summary: str) -> EvaluationOutput:
src/reasoning/findings.py CHANGED
@@ -35,6 +35,8 @@ class Finding:
35
  confidence: float # 0.0-1.0 confidence in this finding
36
  answered_question: bool # Did this iteration answer the user's question?
37
  next_questions: List[str] # Follow-up questions generated
 
 
38
  timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
39
 
40
  def to_dict(self) -> Dict[str, Any]:
@@ -48,6 +50,8 @@ class Finding:
48
  "confidence": self.confidence,
49
  "answered": self.answered_question,
50
  "next_questions": self.next_questions,
 
 
51
  "timestamp": self.timestamp
52
  }
53
 
@@ -121,6 +125,7 @@ class FindingsAccumulator:
121
  self.hypotheses: List[Hypothesis] = []
122
  self.tools_used: List[str] = []
123
  self.files_produced: List[str] = []
 
124
  self.is_answered = False
125
  self.answer_confidence = 0.0
126
  self.started_at = datetime.now().isoformat()
@@ -152,6 +157,23 @@ class FindingsAccumulator:
152
  source_iteration=finding.iteration
153
  ))
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
156
  """Add a hypothesis to test."""
157
  if not any(h.text == text for h in self.hypotheses):
@@ -215,12 +237,18 @@ class FindingsAccumulator:
215
  parts.append(f"**Investigations completed**: {len(self.findings)}")
216
  parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
217
 
 
 
 
 
 
218
  # Recent findings (most relevant for next decision)
219
  recent = self.findings[-max_findings:]
220
  parts.append("\n**Recent findings**:")
221
  for f in recent:
 
222
  parts.append(
223
- f" Step {f.iteration}: Ran `{f.action}` to test: \"{f.hypothesis}\"\n"
224
  f" β†’ Result: {f.interpretation}\n"
225
  f" β†’ Confidence: {f.confidence:.0%}"
226
  )
@@ -257,8 +285,9 @@ class FindingsAccumulator:
257
  # All findings in order
258
  parts.append("\n## Investigation Steps\n")
259
  for f in self.findings:
 
260
  parts.append(
261
- f"### Step {f.iteration}: {f.action}\n"
262
  f"**Hypothesis**: {f.hypothesis}\n"
263
  f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
264
  f"**Result**: {f.result_summary}\n"
 
35
  confidence: float # 0.0-1.0 confidence in this finding
36
  answered_question: bool # Did this iteration answer the user's question?
37
  next_questions: List[str] # Follow-up questions generated
38
+ success: bool = True # Whether the tool execution succeeded
39
+ error_message: str = "" # Error message if tool failed
40
  timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
41
 
42
  def to_dict(self) -> Dict[str, Any]:
 
50
  "confidence": self.confidence,
51
  "answered": self.answered_question,
52
  "next_questions": self.next_questions,
53
+ "success": self.success,
54
+ "error_message": self.error_message,
55
  "timestamp": self.timestamp
56
  }
57
 
 
125
  self.hypotheses: List[Hypothesis] = []
126
  self.tools_used: List[str] = []
127
  self.files_produced: List[str] = []
128
+ self.failed_tools: Dict[str, str] = {} # tool_name β†’ error message
129
  self.is_answered = False
130
  self.answer_confidence = 0.0
131
  self.started_at = datetime.now().isoformat()
 
157
  source_iteration=finding.iteration
158
  ))
159
 
160
def add_failed_tool(self, tool_name: str, error_message: str):
    """Mark *tool_name* as failed so the Reasoner will not retry it.

    If the same tool fails more than once, the most recent error
    message overwrites the earlier one.
    """
    # Registry is keyed by tool name; update() keeps last-error-wins semantics.
    self.failed_tools.update({tool_name: error_message})
163
+
164
def get_failed_tools_context(self) -> str:
    """Render the failed-tool registry as a warning block for prompts.

    Returns an empty string when no tool has failed. Each error message
    is truncated to 150 characters to keep the prompt compact.
    """
    if not self.failed_tools:
        return ""
    lines = ["\n**FAILED TOOLS (do NOT retry these)**:"]
    lines.extend(
        f"  - `{name}`: {err[:150]}" for name, err in self.failed_tools.items()
    )
    return "\n".join(lines)
172
+
173
def get_successful_findings(self) -> List[Finding]:
    """Return only the findings whose tool execution succeeded.

    Order is preserved; failed steps are excluded so downstream
    consumers never cite data produced by a failed tool run.
    """
    return [finding for finding in self.findings if finding.success]
176
+
177
  def add_hypothesis(self, text: str, priority: float = 0.5, source_iteration: int = 0):
178
  """Add a hypothesis to test."""
179
  if not any(h.text == text for h in self.hypotheses):
 
237
  parts.append(f"**Investigations completed**: {len(self.findings)}")
238
  parts.append(f"**Tools used**: {', '.join(self.tools_used)}")
239
 
240
+ # Failed tools warning (critical for avoiding retries)
241
+ failed_ctx = self.get_failed_tools_context()
242
+ if failed_ctx:
243
+ parts.append(failed_ctx)
244
+
245
  # Recent findings (most relevant for next decision)
246
  recent = self.findings[-max_findings:]
247
  parts.append("\n**Recent findings**:")
248
  for f in recent:
249
+ status_tag = "" if f.success else " [FAILED]"
250
  parts.append(
251
+ f" Step {f.iteration}: Ran `{f.action}`{status_tag} to test: \"{f.hypothesis}\"\n"
252
  f" β†’ Result: {f.interpretation}\n"
253
  f" β†’ Confidence: {f.confidence:.0%}"
254
  )
 
285
  # All findings in order
286
  parts.append("\n## Investigation Steps\n")
287
  for f in self.findings:
288
+ status_label = "\u2705 SUCCESS" if f.success else "\u274c FAILED"
289
  parts.append(
290
+ f"### Step {f.iteration}: {f.action} [{status_label}]\n"
291
  f"**Hypothesis**: {f.hypothesis}\n"
292
  f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
293
  f"**Result**: {f.result_summary}\n"
src/reasoning/reasoner.py CHANGED
@@ -77,7 +77,18 @@ CRITICAL RULES:
77
  - If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
78
  - NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
79
  - For visualization, pick the chart type that best answers the question
80
- - NEVER hallucinate column names - use only columns from the schema"""
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  REASONER_USER_TEMPLATE = """**User's question**: {question}
83
 
@@ -94,7 +105,9 @@ REASONER_USER_TEMPLATE = """**User's question**: {question}
94
  **Available tools**:
95
  {tools_description}
96
 
97
- IMPORTANT: For ANY tool that needs a file_path argument, use "{file_path}" β€” the original data file. Do NOT use paths to HTML reports, plots, or other output artifacts.
 
 
98
 
99
  Decide the next action. Respond with ONLY this JSON:
100
  {{
@@ -223,8 +236,8 @@ class Reasoner:
223
  max_tokens=1024
224
  )
225
 
226
- # Parse response
227
- return self._parse_response(response_text, file_path)
228
 
229
  def generate_hypotheses(
230
  self,
@@ -273,7 +286,7 @@ class Reasoner:
273
 
274
  return self._parse_hypotheses(response_text)
275
 
276
- def _parse_response(self, response_text: str, file_path: str) -> ReasoningOutput:
277
  """Parse LLM response into ReasoningOutput."""
278
  try:
279
  # Try direct JSON parse
@@ -319,6 +332,14 @@ class Reasoner:
319
  if fp.lower().endswith(non_data_extensions):
320
  arguments["file_path"] = file_path
321
 
 
 
 
 
 
 
 
 
322
  return ReasoningOutput(
323
  status=status,
324
  reasoning=data.get("reasoning", ""),
 
77
  - If a previous tool produced a new data file (CSV/parquet), use THAT as file_path
78
  - NEVER use an HTML, PNG, or report path as file_path for data-consuming tools
79
  - For visualization, pick the chart type that best answers the question
80
+ - NEVER hallucinate column names - use only columns from the schema
81
+
82
+ TOOL FAILURE RULES:
83
+ - NEVER retry a tool that has already FAILED β€” try a DIFFERENT tool or approach instead
84
+ - If the "FAILED TOOLS" section lists a tool, that tool WILL fail again β€” do not call it
85
+ - If multiple tools have failed, consider stopping and synthesizing what you have
86
+
87
+ QUERY TYPE AWARENESS:
88
+ - For questions about "important features", "feature importance", "correlations", "patterns", or "explain the data":
89
+ Use EDA tools (profile_dataset, analyze_correlations, auto_feature_selection, generate_eda_plots)
90
+ Do NOT use model training tools (train_with_autogluon, train_model, etc.) β€” training is unnecessary for feature explanation
91
+ - Only use model training tools when the user explicitly asks to train, predict, build a model, or classify/regress"""
92
 
93
  REASONER_USER_TEMPLATE = """**User's question**: {question}
94
 
 
105
  **Available tools**:
106
  {tools_description}
107
 
108
+ IMPORTANT:
109
+ - For ANY tool that needs a file_path argument, use "{file_path}" β€” the original data file. Do NOT use paths to HTML reports, plots, or other output artifacts.
110
+ - If a tool is listed under FAILED TOOLS above, do NOT call it again β€” it will fail. Choose a different tool or stop.
111
 
112
  Decide the next action. Respond with ONLY this JSON:
113
  {{
 
236
  max_tokens=1024
237
  )
238
 
239
+ # Parse response (pass findings so we can reject failed tools)
240
+ return self._parse_response(response_text, file_path, findings)
241
 
242
  def generate_hypotheses(
243
  self,
 
286
 
287
  return self._parse_hypotheses(response_text)
288
 
289
+ def _parse_response(self, response_text: str, file_path: str, findings: Optional[FindingsAccumulator] = None) -> ReasoningOutput:
290
  """Parse LLM response into ReasoningOutput."""
291
  try:
292
  # Try direct JSON parse
 
332
  if fp.lower().endswith(non_data_extensions):
333
  arguments["file_path"] = file_path
334
 
335
+ # πŸ›‘οΈ SAFETY: Reject tools that already failed β€” force "done" to stop wasting iterations
336
+ if tool_name and findings and tool_name in findings.failed_tools:
337
+ print(f" ⚠️ Reasoner picked failed tool '{tool_name}' β€” forcing done")
338
+ return ReasoningOutput.done(
339
+ reasoning=f"Tool '{tool_name}' previously failed. Stopping to synthesize available findings.",
340
+ confidence=max(0.3, findings.answer_confidence)
341
+ )
342
+
343
  return ReasoningOutput(
344
  status=status,
345
  reasoning=data.get("reasoning", ""),
src/reasoning/synthesizer.py CHANGED
@@ -39,7 +39,11 @@ RULES:
39
  - Mention generated files/plots so user can find them
40
  - Be honest about confidence levels
41
  - Keep it under 500 words unless complex analysis warrants more
42
- - Use markdown formatting (headers, bullets, bold for emphasis)"""
 
 
 
 
43
 
44
  SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
45
 
@@ -93,9 +97,12 @@ class Synthesizer:
93
  # Build artifacts summary
94
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
95
 
 
 
 
96
  user_prompt = SYNTHESIS_USER_TEMPLATE.format(
97
  question=findings.question,
98
- findings_context=findings.get_context_for_synthesis(),
99
  artifacts_summary=artifacts_summary
100
  )
101
 
@@ -136,14 +143,20 @@ RULES:
136
  - Use specific numbers and metrics
137
  - Mention all generated visualizations with file paths
138
  - Suggest actionable next analysis steps
139
- - Keep it engaging but data-driven"""
 
 
 
140
 
141
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
142
 
 
 
 
143
  user_prompt = f"""**Analysis request**: {findings.question}
144
 
145
  **Investigation summary**:
146
- {findings.get_context_for_synthesis()}
147
 
148
  **Generated artifacts**:
149
  {artifacts_summary}
@@ -179,8 +192,10 @@ Write the exploratory analysis report."""
179
  for f in files:
180
  parts.append(f" - {f}")
181
 
182
- # Extract from findings history
183
  for finding in findings.findings:
 
 
184
  result = finding.result_summary
185
  if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
186
  parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
@@ -193,3 +208,59 @@ Write the exploratory analysis report."""
193
  return "No artifacts generated yet."
194
 
195
  return "\n".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  - Mention generated files/plots so user can find them
40
  - Be honest about confidence levels
41
  - Keep it under 500 words unless complex analysis warrants more
42
+ - Use markdown formatting (headers, bullets, bold for emphasis)
43
+ - ONLY report findings from SUCCESSFUL investigation steps
44
+ - Do NOT invent numbers, statistics, or insights that are not present in the findings
45
+ - If a step is marked [FAILED], ignore its results entirely β€” do not fabricate data from it
46
+ - If most steps failed, be transparent about limited evidence and recommend re-running"""
47
 
48
  SYNTHESIS_USER_TEMPLATE = """**Original question**: {question}
49
 
 
97
  # Build artifacts summary
98
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
99
 
100
+ # Build findings context β€” only successful findings get full detail
101
+ findings_context = self._build_filtered_context(findings)
102
+
103
  user_prompt = SYNTHESIS_USER_TEMPLATE.format(
104
  question=findings.question,
105
+ findings_context=findings_context,
106
  artifacts_summary=artifacts_summary
107
  )
108
 
 
143
  - Use specific numbers and metrics
144
  - Mention all generated visualizations with file paths
145
  - Suggest actionable next analysis steps
146
+ - Keep it engaging but data-driven
147
+ - ONLY report findings from SUCCESSFUL investigation steps
148
+ - Do NOT invent numbers or statistics not present in the findings
149
+ - If a step is marked [FAILED], ignore it entirely"""
150
 
151
  artifacts_summary = self._format_artifacts(artifacts or {}, findings)
152
 
153
+ # Build filtered context β€” only successful findings
154
+ findings_context = self._build_filtered_context(findings)
155
+
156
  user_prompt = f"""**Analysis request**: {findings.question}
157
 
158
  **Investigation summary**:
159
+ {findings_context}
160
 
161
  **Generated artifacts**:
162
  {artifacts_summary}
 
192
  for f in files:
193
  parts.append(f" - {f}")
194
 
195
+ # Extract from findings history β€” only from successful steps
196
  for finding in findings.findings:
197
+ if not finding.success:
198
+ continue
199
  result = finding.result_summary
200
  if "output_file" in result or "output_path" in result or ".html" in result or ".png" in result:
201
  parts.append(f" - Step {finding.iteration} ({finding.action}): output in result")
 
208
  return "No artifacts generated yet."
209
 
210
  return "\n".join(parts)
211
+
212
+ def _build_filtered_context(self, findings: FindingsAccumulator) -> str:
213
+ """
214
+ Build synthesis context that only includes SUCCESSFUL findings in detail.
215
+ Failed findings are listed as a brief summary so the LLM knows they happened
216
+ but cannot hallucinate data from them.
217
+ """
218
+ import json
219
+
220
+ parts = []
221
+ parts.append(f"**Original question**: {findings.question}")
222
+ parts.append(f"**Mode**: {findings.mode}")
223
+
224
+ successful = findings.get_successful_findings()
225
+ failed = [f for f in findings.findings if not f.success]
226
+
227
+ parts.append(f"**Total iterations**: {len(findings.findings)} ({len(successful)} succeeded, {len(failed)} failed)")
228
+ parts.append(f"**Tools used**: {', '.join(findings.tools_used)}")
229
+
230
+ # Only successful findings get full detail
231
+ if successful:
232
+ parts.append("\n## Successful Investigation Steps\n")
233
+ for f in successful:
234
+ parts.append(
235
+ f"### Step {f.iteration}: {f.action}\n"
236
+ f"**Hypothesis**: {f.hypothesis}\n"
237
+ f"**Arguments**: {json.dumps(f.arguments, default=str)}\n"
238
+ f"**Result**: {f.result_summary}\n"
239
+ f"**Interpretation**: {f.interpretation}\n"
240
+ f"**Confidence**: {f.confidence:.0%}\n"
241
+ )
242
+
243
+ # Failed findings get just a one-line mention
244
+ if failed:
245
+ parts.append("\n## Failed Steps (no usable data β€” do NOT cite these)\n")
246
+ for f in failed:
247
+ parts.append(f"- Step {f.iteration}: `{f.action}` FAILED β€” {f.error_message or 'execution error'}")
248
+
249
+ # Hypothesis outcomes
250
+ if findings.hypotheses:
251
+ parts.append("\n## Hypothesis Outcomes\n")
252
+ for h in findings.hypotheses:
253
+ status_emoji = {
254
+ "supported": "\u2705",
255
+ "refuted": "\u274c",
256
+ "inconclusive": "\u2753",
257
+ "testing": "\ud83d\udd04",
258
+ "untested": "\u2b1c"
259
+ }.get(h.status, "\u2b1c")
260
+ parts.append(f"{status_emoji} **{h.text}** \u2192 {h.status}")
261
+ if h.evidence_for:
262
+ parts.append(f" Evidence for: {'; '.join(h.evidence_for)}")
263
+ if h.evidence_against:
264
+ parts.append(f" Evidence against: {'; '.join(h.evidence_against)}")
265
+
266
+ return "\n".join(parts)