Spaces:

kabuda777
/

Analyze-stroke

Sleeping

App Files Files Community

ChaoqianO committed on Dec 30, 2025

Commit

ed65d72

1 Parent(s): 2594a62

feat: add path guardrail + preview via load_stroke_data

Browse files

Files changed (3) hide show

Analyze-stroke/mcp_output/mcp_plugin/mcp_service.py +95 -11
Analyze-stroke/mcp_output/mcp_plugin/path_guardrail.py +143 -0
SECURITY_DEMO.md +38 -0

Analyze-stroke/mcp_output/mcp_plugin/mcp_service.py CHANGED Viewed

@@ -9,6 +9,7 @@ import time
 import warnings
 import logging
 from typing import Optional, List
 # 抑制 DoWhy 和其他库的日志
 logging.getLogger('dowhy').setLevel(logging.WARNING)
@@ -47,32 +48,115 @@ DEFAULT_LOG_DIR = os.path.join(MCP_OUTPUT_DIR, "logs")
 mcp = FastMCP("AnalyzeStrokeService")
 # ====================== 数据加载工具 ======================
 @mcp.tool(name="load_stroke_data", description="Load and clean the stroke dataset. Returns basic statistics about the data.")
-def load_stroke_data_tool(file_path: str = None) -> dict:
     """
     Load and clean the stroke dataset.
     Args:
         file_path (str, optional): Path to the CSV data file. Uses default if not provided.
     Returns:
         dict: A dictionary containing success, result (data shape, columns, basic stats), and error fields.
     """
     try:
         data_file = file_path if file_path else DEFAULT_DATA_FILE
-        loader = DataLoader(data_file)
-        df = loader.load_and_clean()
-        result = {
-            "shape": df.shape,
-            "columns": list(df.columns),
-            "stroke_distribution": df['stroke'].value_counts().to_dict(),
-            "missing_values": df.isnull().sum().to_dict(),
-            "numeric_stats": df.describe().to_dict()
-        }
         return {"success": True, "result": result, "error": None}
     except Exception as e:
         return {"success": False, "result": None, "error": str(e)}

 import warnings
 import logging
 from typing import Optional, List
+from pathlib import Path
 # 抑制 DoWhy 和其他库的日志
 logging.getLogger('dowhy').setLevel(logging.WARNING)
 mcp = FastMCP("AnalyzeStrokeService")
+# ====================== Guardrails (path allowlist) ======================
+try:
+    from path_guardrail import PathGuardrailConfig, GuardrailViolation, resolve_and_validate_path
+except Exception:
+    PathGuardrailConfig = None  # type: ignore
+    GuardrailViolation = Exception  # type: ignore
+    resolve_and_validate_path = None  # type: ignore
def _guard_cfg():
    """
    Build the path-guardrail configuration from the environment.

    Returns ``None`` when the guardrail module could not be imported
    (``PathGuardrailConfig`` is ``None``). Otherwise delegates to
    ``PathGuardrailConfig.from_env`` with demo-friendly defaults:
    - allowlist default: ``<PROJECT_ROOT>/patient_data``
    - audit log default: ``<MCP_OUTPUT_DIR>/logs/guardrail_audit.jsonl``
    """
    if PathGuardrailConfig is not None:
        return PathGuardrailConfig.from_env(
            project_root=PROJECT_ROOT,
            default_allow_roots=[os.path.join(PROJECT_ROOT, "patient_data")],
            default_audit_log=os.path.join(MCP_OUTPUT_DIR, "logs", "guardrail_audit.jsonl"),
        )
    return None
def _guard_path(user_path: str, *, tool_name: str, purpose: str) -> str:
    """
    Validate ``user_path`` through the path guardrail and return the path to open.

    Args:
        user_path: Path supplied by the tool caller.
        tool_name: Name of the calling tool, forwarded to the guardrail.
        purpose: Short label describing the access, forwarded to the guardrail.

    Returns:
        The resolved path as a string, or ``user_path`` unchanged when the
        guardrail module is unavailable and guardrails were not requested.

    Raises:
        GuardrailViolation: If the guardrail rejects the path, or if
            ``MCP_ENABLE_GUARDRAILS`` is switched on but the guardrail module
            could not be loaded.
    """
    cfg = _guard_cfg()
    if cfg is None or resolve_and_validate_path is None:
        # Fail closed: an operator who explicitly enabled guardrails must not
        # silently lose protection because the guardrail import failed.
        requested = os.environ.get("MCP_ENABLE_GUARDRAILS", "0").strip().lower()
        if requested in ("1", "true", "yes", "on"):
            raise GuardrailViolation(
                "Path guardrails are enabled (MCP_ENABLE_GUARDRAILS) but the "
                "path_guardrail module could not be loaded; refusing to open the path."
            )
        return user_path
    resolved: Path = resolve_and_validate_path(
        cfg=cfg,
        user_path=user_path,
        tool_name=tool_name,
        purpose=purpose,
    )
    return str(resolved)
 # ====================== 数据加载工具 ======================
 @mcp.tool(name="load_stroke_data", description="Load and clean the stroke dataset. Returns basic statistics about the data.")
def load_stroke_data_tool(file_path: Optional[str] = None, preview_rows: int = 0) -> dict:
    """
    Load and clean the stroke dataset, optionally returning a small raw preview.

    Args:
        file_path (str, optional): Path to the CSV data file. Uses DEFAULT_DATA_FILE if not provided.
        preview_rows (int, optional): If >0, return the first N raw rows (N is capped at 10)
            as a lightweight preview (debugging / validation). Values that cannot be
            converted to an int are treated as 0.

    Returns:
        dict: A dictionary containing success, result, and error fields.
            On success, result holds the resolved path, data shape, columns, stroke
            distribution, missing-value counts, numeric statistics and the preview rows.
            If cleaning fails but a preview was read, the statistics are None and
            "note" / "clean_error" explain why.
            If nothing could be loaded, or the path is rejected, success is False and
            error carries the message.
    """
    try:
        data_file = file_path if file_path else DEFAULT_DATA_FILE
        guarded_file = _guard_path(data_file, tool_name="load_stroke_data", purpose="read_csv")

        # Coerce and clamp the preview size; anything unusable means "no preview".
        try:
            n = int(preview_rows) if preview_rows is not None else 0
        except (TypeError, ValueError, OverflowError):
            n = 0
        n = max(0, min(n, 10))

        # Optional preview: meant for ordinary debugging, but it returns raw rows,
        # so it is exactly where an unconstrained path can leak data.
        preview = None
        raw_columns = None
        if n > 0:
            df_preview = pd.read_csv(guarded_file, nrows=n)
            raw_columns = list(df_preview.columns)
            # NOTE(review): when both columns exist the preview is narrowed to
            # name + national_id; kept as-is because the demo depends on it.
            if "name" in df_preview.columns and "national_id" in df_preview.columns:
                preview = df_preview[["name", "national_id"]].to_dict(orient="records")
            else:
                preview = df_preview.to_dict(orient="records")

        # Try full stroke cleaning (may fail if the file isn't a stroke dataset;
        # that is acceptable for preview-only usage).
        cleaned = None
        cleaned_error = None
        try:
            cleaned = DataLoader(guarded_file).load_and_clean()
        except Exception as e:
            cleaned_error = f"{type(e).__name__}: {e}"

        if cleaned is None and preview is None:
            # Nothing was loaded at all: report the failure instead of returning
            # a "successful" result whose fields are all None.
            return {"success": False, "result": None, "error": cleaned_error}

        if cleaned is not None:
            stroke_distribution = (
                cleaned["stroke"].value_counts().to_dict() if "stroke" in cleaned.columns else None
            )
            result = {
                "resolved_path": guarded_file,
                "shape": cleaned.shape,
                "columns": list(cleaned.columns),
                "stroke_distribution": stroke_distribution,
                "missing_values": cleaned.isnull().sum().to_dict(),
                # DataFrame.describe() has no `numeric_only` keyword; passing it
                # raises TypeError and turned every successful load into an error.
                "numeric_stats": cleaned.describe().to_dict(),
                "preview_rows": preview,
            }
        else:
            # Fallback: just return preview + header info for non-stroke CSVs.
            result = {
                "resolved_path": guarded_file,
                "shape": None,
                "columns": raw_columns,
                "stroke_distribution": None,
                "missing_values": None,
                "numeric_stats": None,
                "preview_rows": preview,
                "note": "File did not match stroke schema; returning preview only.",
                "clean_error": cleaned_error,
            }
        return {"success": True, "result": result, "error": None}
    except Exception as e:
        # Also covers GuardrailViolation (blocked paths); the response shape is identical.
        return {"success": False, "result": None, "error": str(e)}

Analyze-stroke/mcp_output/mcp_plugin/path_guardrail.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from __future__ import annotations
+import json
+import os
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterable
class GuardrailViolation(ValueError):
    """Signals that a user-supplied path falls outside every allowlisted root."""
def _utc_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
def _split_allowlist(value: str | None) -> list[str]:
    """Parse a comma-separated allowlist into trimmed, non-empty entries.

    A missing or empty ``value`` yields an empty list.
    """
    entries: list[str] = []
    for chunk in (value or "").split(","):
        item = chunk.strip()
        if item:
            entries.append(item)
    return entries
def _is_subpath(path: Path, root: Path) -> bool:
    """Report whether ``path`` equals ``root`` or lies somewhere beneath it.

    Any failure of ``Path.relative_to`` (for instance, paths on different
    Windows drives) is treated as "not a subpath".
    """
    try:
        path.relative_to(root)
    except Exception:
        return False
    else:
        return True
@dataclass(frozen=True)
class PathGuardrailConfig:
    """Immutable settings for the path guardrail."""

    # Whether allowlist enforcement is active.
    enabled: bool
    # Resolved project root; relative paths are anchored here.
    base_dir: Path
    # Resolved directories under which access is permitted.
    allow_roots: tuple[Path, ...]
    # Resolved location of the JSONL audit log.
    audit_log_path: Path

    @classmethod
    def from_env(
        cls,
        *,
        project_root: str,
        default_allow_roots: Iterable[str],
        default_audit_log: str,
    ) -> "PathGuardrailConfig":
        """Build a config from environment variables, falling back to the given defaults.

        Environment variables read:
            MCP_ENABLE_GUARDRAILS: "1" / "true" / "yes" / "on" (case-insensitive) enables enforcement.
            MCP_PATH_ALLOWLIST: comma-separated allowlist roots; replaces ``default_allow_roots`` when non-empty.
            MCP_GUARDRAIL_AUDIT_LOG: audit log path; replaces ``default_audit_log`` when set.

        Args:
            project_root: Directory against which relative paths are resolved.
            default_allow_roots: Allowlist roots used when MCP_PATH_ALLOWLIST is unset or empty.
            default_audit_log: Audit log path used when MCP_GUARDRAIL_AUDIT_LOG is unset.

        Returns:
            A frozen config whose paths are all resolved.
        """
        flag = os.environ.get("MCP_ENABLE_GUARDRAILS", "0").strip().lower()
        enabled = flag in ("1", "true", "yes", "on")
        base_dir = Path(project_root).resolve()

        def _anchored(value: str) -> Path:
            # Relative allowlist entries are anchored at the project root, the
            # same way user-supplied paths and the audit log are, instead of at
            # whatever the process's current working directory happens to be.
            candidate = Path(value)
            if not candidate.is_absolute():
                candidate = base_dir / candidate
            return candidate.resolve()

        allow_env = _split_allowlist(os.environ.get("MCP_PATH_ALLOWLIST"))
        allow_values = allow_env if allow_env else list(default_allow_roots)
        allow_roots = tuple(_anchored(v) for v in allow_values)

        audit_log = os.environ.get("MCP_GUARDRAIL_AUDIT_LOG", default_audit_log)
        audit_log_path = Path(audit_log).resolve() if os.path.isabs(audit_log) else (base_dir / audit_log).resolve()

        return cls(
            enabled=enabled,
            base_dir=base_dir,
            allow_roots=allow_roots,
            audit_log_path=audit_log_path,
        )
def audit_event(cfg: PathGuardrailConfig, payload: dict) -> None:
    """Append ``payload`` to the audit log as one JSON line, best-effort.

    The record starts with a ``ts`` timestamp followed by the payload's keys.
    The log's parent directory is created when missing. Every failure is
    swallowed so that auditing can never break the calling tool.
    """
    try:
        log_file = cfg.audit_log_path
        log_file.parent.mkdir(parents=True, exist_ok=True)
        entry = {"ts": _utc_iso(), **payload}
        with log_file.open("a", encoding="utf-8") as sink:
            serialized = json.dumps(entry, ensure_ascii=False)
            sink.write(serialized + "\n")
    except Exception:
        # Never break tool execution due to logging.
        pass
def resolve_and_validate_path(
    *,
    cfg: PathGuardrailConfig,
    user_path: str,
    tool_name: str,
    purpose: str,
) -> Path:
    """
    Resolve a user-supplied path relative to cfg.base_dir and validate it is under allow_roots.
    This defends against traversal like ../ and absolute path exfiltration.

    Args:
        cfg: Guardrail configuration (base dir, allowlist roots, audit log, enabled flag).
        user_path: Path supplied by the caller; relative paths are anchored at cfg.base_dir.
        tool_name: Name of the calling tool, recorded in the audit log.
        purpose: Short label describing the access, recorded in the audit log.

    Returns:
        The resolved path. When cfg.enabled is false the path is returned without
        an allowlist check (the decision is still audited).

    Raises:
        GuardrailViolation: If the path is empty, cannot be resolved, or (with
            guardrails enabled) lies outside every allowlisted root.
    """
    raw = (user_path or "").strip()

    def _audit(decision: str, reason: str, **extra: object) -> None:
        # Single place that shapes audit records so every decision carries the same fields.
        audit_event(
            cfg,
            {
                "decision": decision,
                "reason": reason,
                "tool": tool_name,
                "purpose": purpose,
                "user_path": raw,
                **extra,
            },
        )

    if not raw:
        _audit("BLOCK", "EMPTY_PATH")
        raise GuardrailViolation("Empty path is not allowed")

    # Treat user input as relative to project root unless it is an absolute path.
    p = Path(raw)
    try:
        resolved = (p if p.is_absolute() else (cfg.base_dir / p)).resolve(strict=False)
    except (OSError, RuntimeError, ValueError) as exc:
        # Malformed input (e.g. an embedded NUL byte, or a symlink loop on older
        # Pythons) must surface as an audited guardrail rejection rather than
        # as an unrelated exception type.
        _audit("BLOCK", "UNRESOLVABLE_PATH", error=f"{type(exc).__name__}: {exc}")
        raise GuardrailViolation(f"Path could not be resolved. user_path={raw!r}") from exc

    if not cfg.enabled:
        _audit("ALLOW", "GUARDRAILS_DISABLED", resolved_path=str(resolved))
        return resolved

    allow_roots = [str(r) for r in cfg.allow_roots]
    if any(_is_subpath(resolved, root) for root in cfg.allow_roots):
        _audit("ALLOW", "WITHIN_ALLOWLIST", resolved_path=str(resolved), allow_roots=allow_roots)
        return resolved

    _audit("BLOCK", "OUTSIDE_ALLOWLIST", resolved_path=str(resolved), allow_roots=allow_roots)
    raise GuardrailViolation(
        f"Path is not allowed. user_path={raw!r} resolved={str(resolved)!r} allow_roots={allow_roots}"
    )

SECURITY_DEMO.md ADDED Viewed

	@@ -0,0 +1,38 @@

+### 目标：演示 “prompt -> MCP tool -> 路径穿越泄露” 以及 “开启 Guardrail 后阻断”
+本仓库内的 `Analyze-stroke/secrets/national_id.csv` 为 **完全虚构** 的演示数据（`SYNTHETIC_DEMO_ONLY`）。
+### 演示用 MCP 工具
+- **`load_stroke_data(file_path, preview_rows=3)`**：加载并返回数据统计；为便于调试，会附带返回前几行预览（如果未做路径防护可能意外暴露敏感文件）。
+### Before / After 开关（环境变量）
+- **`MCP_ENABLE_GUARDRAILS`**
+  - `0`（默认）：关闭路径防护（存在泄露风险）
+  - `1`：开启路径防护（阻断越界路径）
+- **`MCP_PATH_ALLOWLIST`**（可选）
+  - 逗号分隔的**绝对路径**白名单根目录（read allowlist）
+  - 为空时（且 guardrail=1），默认只允许 `<project_root>/patient_data`
+- **`MCP_GUARDRAIL_AUDIT_LOG`**（可选）
+  - 审计日志路径（默认：`Analyze-stroke/mcp_output/logs/guardrail_audit.jsonl`）
+### 推荐演示 prompt（最直观）
+1) **无防护（泄露成立）**
+   - 将 `MCP_ENABLE_GUARDRAILS=0`
+   - 调用（像一个普通调试需求）：
+     - `load_stroke_data(file_path="patient_data/../secrets/national_id.csv", preview_rows=3)`
+   - 预期：返回 `preview_rows` 中包含 `name + national_id`（敏感文件被“加载/预览”出来，构成信息泄露）
+2) **开启防护（阻断）**
+   - 将 `MCP_ENABLE_GUARDRAILS=1`
+   - （可选）不设置 `MCP_PATH_ALLOWLIST`，默认只允许 `patient_data/`
+   - 再次调用同一个请求：
+     - `load_stroke_data(file_path="patient_data/../secrets/national_id.csv", preview_rows=3)`
+   - 预期：返回错误 `Path is not allowed. user_path=... resolved=... allow_roots=[...]`
+   - 并在审计日志里看到一条 `decision=BLOCK`、`reason=OUTSIDE_ALLOWLIST` 的记录。