Spaces:

triflix
/

chatplotapi

Paused

App Files Files Community

triflix committed on Sep 23, 2025

Commit

1035208

verified ·

1 Parent(s): 17ab777

Update pipeline_with_agents.py

Browse files

Files changed (1) hide show

pipeline_with_agents.py +106 -39

pipeline_with_agents.py CHANGED Viewed

@@ -1,17 +1,12 @@
-# (This file is verbatim copy of the pipeline you provided — unchanged.)
-# Save exactly as provided by you.
-# -------------------------
-# Paste the full content you provided earlier here.
-# -------------------------
 """
 Automated Data-Analysis Pipeline with Agent Prompts + Gemini (google-genai)
-Fixed:
- - Uses GEMINI_API_KEY from environment (no hardcoded key)
- - Enforces planning agent contract: model MUST assign `result` variable
- - Hardened execution: exec -> check for `result` or attempt safe eval for single-expression snippets
- - Rejects unsafe snippets via regex blacklist
- - Logs model code when execution fails and falls back to deterministic generator
 """
 import os
@@ -34,25 +29,60 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("pipeline")
 # ---------------------------
-# Agent system prompts
 # ---------------------------
 PROMPTS = {
-    "file_ingestion": """You are a file ingestion agent. Detect file type, extract sheets if Excel, load into structured dataframe. Return metadata: file_type, sheet_names, selected_sheet.""",
-    "preprocessing": """You are a preprocessing agent. Clean and normalize the dataset: handle nulls, infer types, encode categories, scale numerics. Output structured summary of cleaning steps and cleaned dataframe schema.""",
-    "sampling": """You are a sampling agent. From the dataframe, generate head(5), tail(5), and a random sample. Output each in JSON structure.""",
-    "classification": """You are a classification agent. Analyze given dataset samples. Identify domain (finance, sales, demographics, etc.) and suggest suitable visualization tasks (minimum 6). Return structured JSON listing tasks with fields: chart_type (one of pie, bar, line, scatter, histogram, boxplot), target_columns (array), aggregation (string or null), reasoning (short). Example output: { "domain": "sales", "tasks": [ { "chart_type": "pie", "target_columns": ["region"], "aggregation": "count", "reasoning": "distribution of region" }, ... ] }""",
-    "planning": """You are a planning agent. From classification output, create at least 6 chart tasks. For each: chart_type (pie, bar, line, scatter, histogram, boxplot), target_columns, aggregation (if needed), and a Python/pandas code snippet that generates the chart-ready aggregated JSON (but code must be limited to pandas/numpy operations). IMPORTANT: Your code MUST assign the final chart data to a variable named `result` that is a list of dictionaries ready for JSON serialization. Example:
-result = df.groupby('anemia_label').size().reset_index(name='value').to_dict(orient='records')
-Return JSON with array 'tasks'. Example task entry: { "chart_type": "bar", "target_columns": ["month", "sales"], "aggregation": "sum", "code": "result = df.groupby('month')['sales'].sum().reset_index().to_dict(orient=\\'records\\')" }""",
-    "execution": """You are an execution agent. Execute given Python code safely on provided dataset. Return structured JSON results in chart-ready format. Follow schema:
-Pie → [{ "name": "...", "value": ... }]
-Bar → [{ "label": "...", "metric1": ..., "metric2": ... }]
-Line → [{ "x": "...", "y": ... }]
-Scatter → [{ "x": ..., "y": ... }]
-Histogram → [{ "bin": ..., "count": ... }]
-Boxplot → [{ "category": "...", "q1": ..., "median": ..., "q3": ... }]""",
-    "output": """You are an output agent. Aggregate all chart JSON objects into final structured response. Ensure at least 6 charts included. Output JSON with keys: "pie", "bar", "line", "scatter", "histogram", "boxplot"."""
 }
 # ---------------------------
@@ -73,13 +103,24 @@ DISALLOWED_PATTERNS = [
     r"\bsystem\s*\(",
     r"\bPopen\b",
     r"\bsh\b",
 ]
 def code_is_safe(code: str) -> Tuple[bool, Optional[str]]:
     lowered = code
     for pat in DISALLOWED_PATTERNS:
-        if re.search(pat, lowered):
             return False, f"disallowed pattern: {pat}"
     return True, None
@@ -203,8 +244,9 @@ def gemini_generate_json(model: str, system_instruction: str, user_content: str,
     """
     Calls genai generate_content_stream with given system prompt and user content.
     Expects the model to return JSON text. Joins chunks and returns parsed JSON or raw text.
     """
-    api_key = "AIzaSyDfy0E-9b2XjoYHrHX2C1nVLHWyrWUFkMs"
     if not api_key:
         raise EnvironmentError("GEMINI_API_KEY not set in environment.")
     client = genai.Client(api_key=api_key)
@@ -503,12 +545,38 @@ def process_file(path: str, sheet: Optional[str] = None, model: str = "gemini-2.
         # If model didn't produce tasks array, use classification tasks
         tasks = classification_output.get("tasks", [])
     # Ensure at least 6 tasks
     tasks = ensure_six_tasks(tasks, pre_df)
     # Execute tasks
-    final = {"pie": [], "bar": [], "line": [], "scatter": [], "histogram": [], "boxplot": []}
-    execution_errors = []
     for idx, task in enumerate(tasks):
         chart_type = task.get("chart_type")
@@ -518,7 +586,7 @@ def process_file(path: str, sheet: Optional[str] = None, model: str = "gemini-2.
         if code_snippet:
             safe, reason = code_is_safe(code_snippet)
             if not safe:
-                logger.warning("Rejected unsafe code snippet: %s", reason)
             else:
                 # Controlled globals for exec/eval
                 allowed_globals = {
@@ -537,7 +605,7 @@ def process_file(path: str, sheet: Optional[str] = None, model: str = "gemini-2.
                     "np": np,
                     "df": pre_df.copy(),
                 }
-                local_vars = {}
                 try:
                     # 1) Try exec (model should assign `result`)
                     exec(code_snippet, allowed_globals, local_vars)
@@ -564,19 +632,18 @@ def process_file(path: str, sheet: Optional[str] = None, model: str = "gemini-2.
                     # 3) Normalize result into list-of-dicts
                     result_json = None
                     if isinstance(result, pd.DataFrame):
-                        result_json = [ {k: to_json_serializable(v) for k,v in r.items()} for r in result.to_dict(orient="records") ]
                     elif isinstance(result, list):
                         norm = []
-                        valid = True
                         for r in result:
                             if isinstance(r, dict):
-                                norm.append({k: to_json_serializable(v) for k,v in r.items()})
                             else:
                                 # allow primitive lists but wrap as dict with value key
-                                norm.append(to_json_serializable(r))
                         result_json = norm
                     elif isinstance(result, dict):
-                        result_json = [{k: to_json_serializable(v) for k,v in result.items()}]
                     else:
                         # primitive or None -> invalid for chart payload
                         result_json = None
@@ -597,7 +664,7 @@ def process_file(path: str, sheet: Optional[str] = None, model: str = "gemini-2.
                                 final.setdefault(chart_type, []).extend(normalized)
                             executed = True
                     if not executed:
-                        execution_errors.append({"task_index": idx, "reason": "result not list-of-dicts or missing", "code": code_snippet})
                 except Exception as e:
                     logger.exception("Model code execution failed for task %s: %s", idx, str(e))
                     execution_errors.append({"task_index": idx, "reason": "exception during exec/eval", "exception": str(e), "code": code_snippet})

 """
 Automated Data-Analysis Pipeline with Agent Prompts + Gemini (google-genai)
+Changes applied:
+ - Use GEMINI_API_KEY from environment (no hardcoded key)
+ - Stronger, model-proof PROMPTS that forbid plotting and require `result` assignment
+ - Extended DISALLOWED_PATTERNS to block plotting libraries and plotting methods
+ - Validation step after planning: drop model-provided code that lacks `result` or uses plotting tokens; record execution_errors
+ - Execution still performs safety checks and falls back to deterministic generators when needed
 """
 import os
 logger = logging.getLogger("pipeline")
 # ---------------------------
+# Agent system prompts (strict, plotting banned)
 # ---------------------------
 PROMPTS = {
+    "file_ingestion": (
+        "You are a file ingestion agent. Detect file type; if Excel enumerate sheets and pick the specified sheet or default to the first. "
+        "Load the chosen sheet into a pandas DataFrame and return only metadata (no narrative): "
+        '{"file_type":"<.csv|.xlsx|...>", "sheet_names":[...], "selected_sheet":"..."}'
+    ),
+    "preprocessing": (
+        "You are a preprocessing agent. Clean and normalize the dataset deterministically. "
+        "Operations allowed: trim strings, coerce numeric columns with pandas.to_numeric, fill numeric NaNs with median, fill object NaNs with mode, "
+        "generate one-line schema summary. RETURN JSON only: {\"actions\": [...], \"schema\": {\"columns\":[{\"name\":\"...\",\"dtype\":\"...\",\"n_unique\":N},...], \"n_rows\":N}}. "
+        "Do NOT print or return any code, diagrams, or explanations."
+    ),
+    "sampling": (
+        "You are a sampling agent. From the cleaned dataframe produce three JSON arrays: head(5), tail(5), random(5). "
+        "Return JSON: {\"head\": [...], \"tail\": [...], \"random\": [...]} where each array contains row dicts. Do NOT include extra fields."
+    ),
+    "classification": (
+        "You are a classification agent. Examine provided samples and schema. Identify dataset domain (one-word) and propose at least SIX visualization tasks. "
+        "Each task must be a JSON object: {\"task_id\":\"tN\",\"chart_type\":\"pie|bar|line|scatter|histogram|boxplot\",\"target_columns\":[...],"
+        "\"aggregation\": null|\"count\"|\"sum\"|\"mean\",\"reasoning\":\"one-sentence\"}. "
+        "Return JSON exactly: {\"domain\":\"...\",\"tasks\":[...]} and nothing else. Do NOT include code. Do NOT recommend plotting libraries."
+    ),
+    "planning": (
+        "You are a planning agent. Input: the classification JSON + schema + small samples. Produce at least SIX task entries. "
+        "For each task output a Python/pandas code snippet that uses ONLY pandas and numpy (and the dataframe variable `df`) and assigns the final result to a variable named `result`. "
+        "REQUIREMENTS for the code string: "
+        " - Must NOT import or reference matplotlib, seaborn, plotly, altair, bokeh, or any plotting functions. "
+        " - Must NOT call pandas plotting methods (e.g. .plot(), .hist() wrapper that uses matplotlib). "
+        " - Must NOT use eval/exec/compile or open(). "
+        " - Allowed names: df, pd, np, len, sum, min, max, round, sorted. "
+        " - The code must produce `result` as a list of dictionaries ready for JSON serialization (use .to_dict(orient='records') or list comprehension). "
+        " - Return JSON exactly: {\"tasks\":[ {\"task_id\":\"t1\",\"chart_type\":\"pie\",\"target_columns\":[\"colA\"],"
+        "\"aggregation\":\"count\",\"reasoning\":\"...\",\"code\":\"result = df.groupby('colA').size().reset_index(name=\\'value\\').to_dict(orient=\\'records\\')\" }, ... ] }"
+    ),
+    "execution": (
+        "You are an execution agent. You will run model-provided code in a restricted execution environment WITHOUT plotting libraries. "
+        "The executor expects the code to assign a variable named `result` containing a list of dicts. "
+        "Rules: do not rely on plotting functions. Use pandas/numpy for aggregation and numeric work only. "
+        "Schema expectations per chart type (examples only): "
+        " Pie → [{\"name\":\"...\",\"value\":number}], "
+        " Bar → [{\"label\":\"...\",\"metric1\":number, ...}], "
+        " Line → [{\"x\":...,\"y\":...}] (x may be ISO string), "
+        " Scatter → [{\"x\":number,\"y\":number}], "
+        " Histogram → [{\"bin\":\"start-end\",\"count\":number}], "
+        " Boxplot → [{\"category\":\"...\",\"q1\":number,\"median\":number,\"q3\":number}]. "
+        "Return nothing else; the pipeline will read `result` after execution. If you must provide example code show it only as a code string and follow the allowed-names rule."
+    ),
+    "output": (
+        "You are an output agent. Aggregate final chart JSON objects into a single JSON object with keys: "
+        '"pie","bar","line","scatter","histogram","boxplot". Each key maps to an array (may be empty). Output JSON only.'
+    )
 }
 # ---------------------------
     r"\bsystem\s*\(",
     r"\bPopen\b",
     r"\bsh\b",
+    # plotting libraries / functions
+    r"\bmatplotlib\b",
+    r"\bseaborn\b",
+    r"\bplotly\b",
+    r"\baltair\b",
+    r"\bbokeh\b",
+    r"\.plot\s*\(",
+    r"\.hist\s*\(",
+    r"\.boxplot\s*\(",
+    r"\bpyplot\b",
+    r"\bplt\b",
 ]
 def code_is_safe(code: str) -> Tuple[bool, Optional[str]]:
     lowered = code
     for pat in DISALLOWED_PATTERNS:
+        if re.search(pat, lowered, flags=re.I):
             return False, f"disallowed pattern: {pat}"
     return True, None
     """
     Calls genai generate_content_stream with given system prompt and user content.
     Expects the model to return JSON text. Joins chunks and returns parsed JSON or raw text.
+    Uses GEMINI_API_KEY from environment.
     """
+    api_key = os.environ.get("GEMINI_API_KEY")
     if not api_key:
         raise EnvironmentError("GEMINI_API_KEY not set in environment.")
     client = genai.Client(api_key=api_key)
         # If model didn't produce tasks array, use classification tasks
         tasks = classification_output.get("tasks", [])
+    # Execution errors list (populate during validation/execution)
+    execution_errors: List[Dict[str, Any]] = []
+    # Validate model-provided code before execution:
+    # - require 'result' assignment inside code
+    # - drop code that contains plotting tokens or disallowed patterns
+    plotting_disallowed_re = re.compile(r"(matplotlib|seaborn|plotly|altair|bokeh|\.plot\s*\(|\.hist\s*\(|\.boxplot\s*\(|plt\b|pyplot\b)", flags=re.I)
+    for i, t in enumerate(tasks):
+        code = t.get("code", "") or ""
+        if code:
+            # 1) basic presence of `result`
+            if "result" not in code:
+                t.pop("code", None)
+                execution_errors.append({"task_index": i, "task_id": t.get("task_id"), "reason": "missing 'result' assignment - code dropped"})
+                continue
+            # 2) plotting tokens check
+            if plotting_disallowed_re.search(code):
+                t.pop("code", None)
+                execution_errors.append({"task_index": i, "task_id": t.get("task_id"), "reason": "plotting functions not allowed - code dropped"})
+                continue
+            # 3) disallowed patterns check
+            safe, reason = code_is_safe(code)
+            if not safe:
+                t.pop("code", None)
+                execution_errors.append({"task_index": i, "task_id": t.get("task_id"), "reason": f"disallowed pattern - code dropped: {reason}"})
+                continue
     # Ensure at least 6 tasks
     tasks = ensure_six_tasks(tasks, pre_df)
     # Execute tasks
+    final: Dict[str, List[Dict[str, Any]]] = {"pie": [], "bar": [], "line": [], "scatter": [], "histogram": [], "boxplot": []}
     for idx, task in enumerate(tasks):
         chart_type = task.get("chart_type")
         if code_snippet:
             safe, reason = code_is_safe(code_snippet)
             if not safe:
+                logger.warning("Rejected unsafe code snippet at execution time: %s", reason)
             else:
                 # Controlled globals for exec/eval
                 allowed_globals = {
                     "np": np,
                     "df": pre_df.copy(),
                 }
+                local_vars: Dict[str, Any] = {}
                 try:
                     # 1) Try exec (model should assign `result`)
                     exec(code_snippet, allowed_globals, local_vars)
                     # 3) Normalize result into list-of-dicts
                     result_json = None
                     if isinstance(result, pd.DataFrame):
+                        result_json = [{k: to_json_serializable(v) for k, v in r.items()} for r in result.to_dict(orient="records")]
                     elif isinstance(result, list):
                         norm = []
                         for r in result:
                             if isinstance(r, dict):
+                                norm.append({k: to_json_serializable(v) for k, v in r.items()})
                             else:
                                 # allow primitive lists but wrap as dict with value key
+                                norm.append({"value": to_json_serializable(r)})
                         result_json = norm
                     elif isinstance(result, dict):
+                        result_json = [{k: to_json_serializable(v) for k, v in result.items()}]
                     else:
                         # primitive or None -> invalid for chart payload
                         result_json = None
                                 final.setdefault(chart_type, []).extend(normalized)
                             executed = True
                     if not executed:
+                        execution_errors.append({"task_index": idx, "reason": "result not list-of-dicts or missing after exec", "code": code_snippet})
                 except Exception as e:
                     logger.exception("Model code execution failed for task %s: %s", idx, str(e))
                     execution_errors.append({"task_index": idx, "reason": "exception during exec/eval", "exception": str(e), "code": code_snippet})