Spaces:
Sleeping
Sleeping
Update backend/core/explainability.py
Browse files
backend/core/explainability.py
CHANGED
|
@@ -1,30 +1,44 @@
|
|
| 1 |
import shap
|
| 2 |
import numpy as np
|
|
|
|
| 3 |
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
| 4 |
from sklearn.linear_model import LinearRegression, LogisticRegression
|
| 5 |
|
| 6 |
|
| 7 |
class ExplainabilityEngine:
|
| 8 |
def explain_tabular(self, model_pipeline, X_sample):
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
raise ValueError("Sample data is empty, cannot compute explanations")
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
preprocessor = model_pipeline.named_steps["preprocessor"]
|
| 13 |
model = model_pipeline.named_steps["model"]
|
| 14 |
|
|
|
|
| 15 |
X_transformed = preprocessor.transform(X_sample)
|
| 16 |
|
| 17 |
-
if X_transformed.shape[0] == 0:
|
| 18 |
-
raise ValueError("Transformed
|
| 19 |
|
| 20 |
-
#
|
| 21 |
if isinstance(model, (RandomForestClassifier, RandomForestRegressor)):
|
| 22 |
explainer = shap.TreeExplainer(model)
|
| 23 |
shap_values = explainer.shap_values(
|
| 24 |
X_transformed, check_additivity=False
|
| 25 |
)
|
| 26 |
|
| 27 |
-
#
|
| 28 |
if isinstance(shap_values, list):
|
| 29 |
shap_values = shap_values[1]
|
| 30 |
|
|
@@ -37,12 +51,53 @@ class ExplainabilityEngine:
|
|
| 37 |
f"Explainability not supported for model type: {type(model)}"
|
| 38 |
)
|
| 39 |
|
|
|
|
| 40 |
if shap_values is None or len(shap_values) == 0:
|
| 41 |
raise ValueError("SHAP computation failed")
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
return
|
|
|
|
import collections

import numpy as np
import shap
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression


class ExplainabilityEngine:
    """Computes SHAP-based global feature importance for fitted sklearn pipelines."""

    def explain_tabular(self, model_pipeline, X_sample):
        """
        Compute global feature importance for a tabular model using SHAP.

        Aggregates SHAP values from the transformed (encoded) feature space
        back to the original feature names and returns the top 10.

        Parameters
        ----------
        model_pipeline : sklearn.pipeline.Pipeline
            Fitted pipeline with named steps ``"preprocessor"`` and ``"model"``.
        X_sample : pandas.DataFrame
            Raw (untransformed) sample rows; must be non-empty.
            # assumes a DataFrame — the ``.empty`` check below requires it.

        Returns
        -------
        dict
            Mapping of original feature name -> summed mean(|SHAP|) importance,
            sorted descending and truncated to the 10 largest entries.

        Raises
        ------
        ValueError
            On empty input, a malformed pipeline, an unsupported model type,
            or a failed/inconsistent SHAP computation.
        """
        # -------------------- Validation --------------------
        if X_sample is None or X_sample.empty:
            raise ValueError("Sample data is empty, cannot compute explanations")

        if "preprocessor" not in model_pipeline.named_steps:
            raise ValueError("Pipeline missing 'preprocessor' step")

        if "model" not in model_pipeline.named_steps:
            raise ValueError("Pipeline missing 'model' step")

        preprocessor = model_pipeline.named_steps["preprocessor"]
        model = model_pipeline.named_steps["model"]

        # -------------------- Transform Data --------------------
        X_transformed = preprocessor.transform(X_sample)

        if X_transformed is None or X_transformed.shape[0] == 0:
            raise ValueError("Transformed data is empty after preprocessing")

        # -------------------- SHAP Explainer Selection --------------------
        if isinstance(model, (RandomForestClassifier, RandomForestRegressor)):
            # check_additivity=False avoids spurious failures on sparse /
            # preprocessed inputs where the additivity check is too strict.
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(
                X_transformed, check_additivity=False
            )

            # For binary classification, older SHAP returns a per-class list;
            # take the positive class.
            # NOTE(review): newer shap versions return a 3-D array instead of a
            # list here — that case falls through to the length check below.
            if isinstance(shap_values, list):
                shap_values = shap_values[1]
        elif isinstance(model, (LinearRegression, LogisticRegression)):
            # NOTE(review): this branch was collapsed in the diff this file was
            # recovered from; reconstructed from the module imports and the
            # shared error message below — confirm against the original source.
            explainer = shap.LinearExplainer(model, X_transformed)
            shap_values = explainer.shap_values(X_transformed)
        else:
            raise ValueError(
                f"Explainability not supported for model type: {type(model)}"
            )

        # -------------------- Validate SHAP Output --------------------
        if shap_values is None or len(shap_values) == 0:
            raise ValueError("SHAP computation failed")

        if isinstance(shap_values, list):
            shap_values = np.array(shap_values)

        # -------------------- Aggregate Importance --------------------
        # Mean absolute SHAP value per encoded feature (global importance).
        abs_shap = np.abs(shap_values)
        mean_shap = abs_shap.mean(axis=0)

        try:
            feature_names = preprocessor.get_feature_names_out()
        except Exception as e:
            # Chain the cause so the underlying preprocessor error is preserved.
            raise ValueError(
                f"Failed to retrieve feature names from preprocessor: {e}"
            ) from e

        if len(feature_names) != len(mean_shap):
            raise ValueError(
                "Mismatch between SHAP values and feature names"
            )

        # Aggregate encoded features back to original feature names.
        # Transformer prefixes look like:
        #   num__WindSpeed
        #   cat__PaymentMethod_CreditCard
        aggregated_importance = collections.defaultdict(float)

        for feature_name, importance in zip(feature_names, mean_shap):
            if "__" in feature_name:
                # NOTE(review): split("_")[0] truncates original feature names
                # that themselves contain underscores (e.g. "Payment_Method"
                # -> "Payment") — verify the project's column naming convention.
                original_feature = feature_name.split("__")[1].split("_")[0]
            else:
                original_feature = feature_name

            aggregated_importance[original_feature] += float(importance)

        if not aggregated_importance:
            raise ValueError("No feature importance computed after aggregation")

        # -------------------- Sort + Limit Output --------------------
        sorted_importance = dict(
            sorted(
                aggregated_importance.items(),
                key=lambda x: x[1],
                reverse=True,
            )[:10]
        )

        return sorted_importance