Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Community

AshenH committed on Oct 13

Commit

e4818d5

verified ·

1 Parent(s): 4cad9bd

Update tools/explain_tool.py

Browse files

Files changed (1) hide show

tools/explain_tool.py +289 -50

tools/explain_tool.py CHANGED Viewed

@@ -3,10 +3,13 @@ import os
 import io
 import json
 import base64
 from typing import Dict, Optional
 import shap
 import pandas as pd
 import matplotlib.pyplot as plt
 import joblib
 from huggingface_hub import hf_hub_download
@@ -14,70 +17,306 @@ from huggingface_hub import hf_hub_download
 from utils.config import AppConfig
 from utils.tracing import Tracer
 class ExplainTool:
     """
-    Generates global SHAP visualizations for a sample of rows (CPU-friendly).
     """
     def __init__(self, cfg: AppConfig, tracer: Tracer):
         self.cfg = cfg
         self.tracer = tracer
         self._model = None
         self._feature_order = None
     def _ensure_model(self):
         if self._model is not None:
             return
-        token = os.getenv("HF_TOKEN")
-        repo = self.cfg.hf_model_repo
-        model_path = hf_hub_download(repo_id=repo, filename="model.pkl", token=token)
-        self._model = joblib.load(model_path)
         try:
-            meta_path = hf_hub_download(repo_id=repo, filename="feature_metadata.json", token=token)
-            with open(meta_path, "r", encoding="utf-8") as f:
-                meta = json.load(f) or {}
-            self._feature_order = meta.get("feature_order")
-        except Exception:
-            self._feature_order = None
-    @staticmethod
-    def _to_data_uri(fig) -> str:
-        buf = io.BytesIO()
-        fig.savefig(buf, format="png", bbox_inches="tight", dpi=150)
-        plt.close(fig)
-        buf.seek(0)
-        return "data:image/png;base64," + base64.b64encode(buf.read()).decode()
-    def run(self, df: Optional[pd.DataFrame]) -> Dict[str, str]:
-        self._ensure_model()
-        if df is None or len(df) == 0:
-            return {}
         if self._feature_order:
-            cols = [c for c in self._feature_order if c in df.columns]
-            X = df[cols].copy()
         else:
             X = df.copy()
-        n = min(len(X), 500)
-        sample = X.sample(n, random_state=42) if len(X) > n else X
-        explainer = shap.Explainer(self._model, sample)
-        sv = explainer(sample)
-        fig_bar = plt.figure()
-        shap.plots.bar(sv, show=False)
-        bar_uri = self._to_data_uri(fig_bar)
-        fig_bee = plt.figure()
-        shap.plots.beeswarm(sv, show=False)
-        bee_uri = self._to_data_uri(fig_bee)
         try:
-            self.tracer.trace_event("explain", {"rows": int(n)})
-        except Exception:
-            pass
-        return {"global_bar": bar_uri, "beeswarm": bee_uri}

 import io
 import json
 import base64
+import logging
 from typing import Dict, Optional
 import shap
 import pandas as pd
+import matplotlib
+matplotlib.use('Agg')  # Non-interactive backend
 import matplotlib.pyplot as plt
 import joblib
 from huggingface_hub import hf_hub_download
 from utils.config import AppConfig
 from utils.tracing import Tracer
+# Module-level logger, named after this module
+logger = logging.getLogger(__name__)
+# Constants
+# Upper cap applied to the requested SHAP sample size
+MAX_SAMPLE_SIZE = 1000
+# Lower bound for the sample size; inputs with at most this many rows are used unsampled
+MIN_SAMPLE_SIZE = 10
+# Number of rows sampled when the caller does not pass an explicit sample size
+DEFAULT_SAMPLE_SIZE = 500
+# PNG size in MB above which a warning is logged (the image is still returned)
+MAX_IMAGE_SIZE_MB = 5
+class ExplainToolError(Exception):
+    """Custom exception for explanation tool errors."""
+    pass
 class ExplainTool:
     """
+    Generates SHAP-based model explanations with global visualizations.
+    CPU-friendly with sampling for large datasets.
     """
     def __init__(self, cfg: AppConfig, tracer: Tracer):
         self.cfg = cfg
         self.tracer = tracer
         self._model = None
         self._feature_order = None
+        logger.info("ExplainTool initialized (lazy loading)")
     def _ensure_model(self):
+        """Lazy load model and metadata from HuggingFace."""
         if self._model is not None:
             return
         try:
+            token = os.getenv("HF_TOKEN")
+            repo = self.cfg.hf_model_repo
+            if not repo:
+                raise ExplainToolError("HF_MODEL_REPO not configured")
+            logger.info(f"Loading model for explanations from: {repo}")
+            # Download and load model
+            try:
+                model_path = hf_hub_download(
+                    repo_id=repo,
+                    filename="model.pkl",
+                    token=token
+                )
+                self._model = joblib.load(model_path)
+                logger.info(f"Model loaded: {type(self._model).__name__}")
+            except Exception as e:
+                raise ExplainToolError(f"Failed to load model: {e}") from e
+            # Load feature metadata
+            try:
+                meta_path = hf_hub_download(
+                    repo_id=repo,
+                    filename="feature_metadata.json",
+                    token=token
+                )
+                with open(meta_path, "r", encoding="utf-8") as f:
+                    meta = json.load(f) or {}
+                self._feature_order = meta.get("feature_order")
+                logger.info(f"Loaded feature order: {len(self._feature_order or [])} features")
+            except Exception as e:
+                logger.warning(f"Could not load feature metadata: {e}")
+                self._feature_order = None
+        except ExplainToolError:
+            raise
+        except Exception as e:
+            raise ExplainToolError(f"Model initialization failed: {e}") from e
+    def _validate_data(self, df: pd.DataFrame) -> tuple[bool, str]:
+        """
+        Validate input dataframe.
+        Returns (is_valid, error_message).
+        """
+        if df is None or df.empty:
+            return False, "Input dataframe is empty"
+        if len(df.columns) == 0:
+            return False, "Dataframe has no columns"
+        return True, ""
+    def _prepare_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Prepare feature matrix for SHAP analysis.
+        Selects and orders features according to model expectations.
+        """
         if self._feature_order:
+            # Use specified feature order
+            available_features = [col for col in self._feature_order if col in df.columns]
+            missing_features = [col for col in self._feature_order if col not in df.columns]
+            if missing_features:
+                logger.warning(
+                    f"Missing {len(missing_features)} features for explanation: "
+                    f"{missing_features[:5]}"
+                )
+            if not available_features:
+                raise ExplainToolError(
+                    f"No required features found in dataframe. "
+                    f"Required: {self._feature_order}, "
+                    f"Available: {list(df.columns)}"
+                )
+            X = df[available_features].copy()
+            logger.info(f"Using {len(available_features)} features for explanation")
         else:
+            # Use all columns
             X = df.copy()
+            logger.warning("No feature order specified - using all columns")
+        # Remove non-numeric columns
+        numeric_cols = X.select_dtypes(include=['number']).columns
+        if len(numeric_cols) < len(X.columns):
+            dropped = set(X.columns) - set(numeric_cols)
+            logger.warning(f"Dropping {len(dropped)} non-numeric columns: {list(dropped)[:5]}")
+            X = X[numeric_cols]
+        if X.empty or len(X.columns) == 0:
+            raise ExplainToolError("No numeric features available for explanation")
+        return X
+    def _sample_data(self, X: pd.DataFrame, sample_size: int = DEFAULT_SAMPLE_SIZE) -> pd.DataFrame:
+        """
+        Sample data for SHAP analysis to keep computation manageable.
+        """
+        n = len(X)
+        if n <= MIN_SAMPLE_SIZE:
+            logger.info(f"Using all {n} rows (below minimum sample size)")
+            return X
+        # Determine sample size
+        target_size = min(sample_size, MAX_SAMPLE_SIZE)
+        target_size = max(target_size, MIN_SAMPLE_SIZE)
+        if n <= target_size:
+            logger.info(f"Using all {n} rows (below target sample size)")
+            return X
+        # Stratified sampling if possible
         try:
+            sample = X.sample(n=target_size, random_state=42)
+            logger.info(f"Sampled {target_size} rows from {n} total")
+            return sample
+        except Exception as e:
+            logger.warning(f"Sampling failed: {e}, using head()")
+            return X.head(target_size)
+    @staticmethod
+    def _to_data_uri(fig) -> str:
+        """
+        Convert matplotlib figure to base64 data URI.
+        Includes size validation.
+        """
+        try:
+            buf = io.BytesIO()
+            fig.savefig(buf, format="png", bbox_inches="tight", dpi=150)
+            plt.close(fig)
+            buf.seek(0)
+            # Check size
+            size_mb = len(buf.getvalue()) / (1024 * 1024)
+            if size_mb > MAX_IMAGE_SIZE_MB:
+                logger.warning(f"Generated image is large: {size_mb:.2f} MB")
+            data_uri = "data:image/png;base64," + base64.b64encode(buf.read()).decode()
+            logger.debug(f"Generated data URI of size: {len(data_uri)} chars")
+            return data_uri
+        except Exception as e:
+            logger.error(f"Failed to convert figure to data URI: {e}")
+            raise ExplainToolError(f"Image conversion failed: {e}") from e
+    def _generate_shap_values(self, X: pd.DataFrame) -> shap.Explanation:
+        """
+        Generate SHAP values for the sample.
+        """
+        try:
+            logger.info("Creating SHAP explainer...")
+            explainer = shap.Explainer(self._model, X)
+            logger.info("Computing SHAP values...")
+            shap_values = explainer(X)
+            logger.info(f"SHAP values computed: shape={shap_values.values.shape}")
+            return shap_values
+        except Exception as e:
+            raise ExplainToolError(f"SHAP computation failed: {e}") from e
+    def _create_bar_plot(self, shap_values: shap.Explanation) -> str:
+        """Create global feature importance bar plot."""
+        try:
+            logger.info("Creating bar plot...")
+            fig = plt.figure(figsize=(10, 6))
+            shap.plots.bar(shap_values, show=False, max_display=20)
+            plt.title("Feature Importance (Global)", fontsize=14, pad=20)
+            plt.xlabel("Mean |SHAP value|", fontsize=12)
+            plt.tight_layout()
+            uri = self._to_data_uri(fig)
+            logger.info("Bar plot created successfully")
+            return uri
+        except Exception as e:
+            logger.error(f"Bar plot creation failed: {e}")
+            # Return empty data URI rather than failing completely
+            return ""
+    def _create_beeswarm_plot(self, shap_values: shap.Explanation) -> str:
+        """Create beeswarm plot showing feature effects."""
+        try:
+            logger.info("Creating beeswarm plot...")
+            fig = plt.figure(figsize=(10, 8))
+            shap.plots.beeswarm(shap_values, show=False, max_display=20)
+            plt.title("Feature Effects Distribution", fontsize=14, pad=20)
+            plt.tight_layout()
+            uri = self._to_data_uri(fig)
+            logger.info("Beeswarm plot created successfully")
+            return uri
+        except Exception as e:
+            logger.error(f"Beeswarm plot creation failed: {e}")
+            return ""
+    def run(self, df: Optional[pd.DataFrame]) -> Dict[str, str]:
+        """
+        Generate SHAP explanations for input data.
+        Args:
+            df: Input dataframe with features
+        Returns:
+            Dictionary mapping plot names to base64 data URIs
+        Raises:
+            ExplainToolError: If explanation generation fails
+        """
+        try:
+            # Validate input
+            is_valid, error_msg = self._validate_data(df)
+            if not is_valid:
+                logger.warning(f"Invalid input: {error_msg}")
+                return {}
+            # Ensure model is loaded
+            self._ensure_model()
+            # Prepare features
+            X = self._prepare_features(df)
+            logger.info(f"Prepared features: {X.shape}")
+            # Sample data for efficiency
+            sample = self._sample_data(X)
+            # Generate SHAP values
+            shap_values = self._generate_shap_values(sample)
+            # Create visualizations
+            result = {}
+            # Bar plot (feature importance)
+            bar_uri = self._create_bar_plot(shap_values)
+            if bar_uri:
+                result["global_bar"] = bar_uri
+            # Beeswarm plot (feature effects)
+            bee_uri = self._create_beeswarm_plot(shap_values)
+            if bee_uri:
+                result["beeswarm"] = bee_uri
+            # Log success
+            logger.info(f"Generated {len(result)} explanation visualizations")
+            if self.tracer:
+                self.tracer.trace_event("explain", {
+                    "rows": len(sample),
+                    "features": len(X.columns),
+                    "visualizations": len(result)
+                })
+            return result
+        except ExplainToolError:
+            raise
+        except Exception as e:
+            error_msg = f"Explanation generation failed: {str(e)}"
+            logger.error(error_msg)
+            if self.tracer:
+                self.tracer.trace_event("explain_error", {"error": error_msg})
+            raise ExplainToolError(error_msg) from e