File size: 4,160 Bytes
dc99b5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b136ae1
dc99b5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import sys
from pathlib import Path
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Make project-local packages importable when this file is executed directly:
# prepend the repository root (three levels above this file) to sys.path.
# NOTE(review): this only affects imports that occur after this line.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))


# Human-readable names for the model's input features, in the exact column
# order the model was trained on (zip() pairs them positionally below).
# These look like RDKit molecular descriptor names — presumably computed by
# an upstream featurization step; confirm against the training pipeline.
FEATURE_NAMES = [
    "MolWt",
    "LogP", 
    "TPSA",
    "NumHDonors",
    "NumHAcceptors",
    "NumRotatableBonds",
    "RingCount"
]


class ModelInterpreter:
    """Provides model interpretability via SHAP and uncertainty quantification.

    Wraps a trained RandomForestRegressor and exposes:
      * global feature importances (from the fitted forest),
      * per-prediction SHAP attributions,
      * empirical prediction intervals from the individual trees,
      * a waterfall-style bar plot of feature contributions.
    """

    def __init__(self, model: RandomForestRegressor, X_train: np.ndarray):
        """
        Initialize the interpreter with a trained model and training data.

        Args:
            model: Trained RandomForestRegressor
            X_train: Training features; a subsample is retained as background data
        """
        self.model = model
        self.explainer = shap.TreeExplainer(model)
        # NOTE(review): this background sample is stored but never passed to
        # TreeExplainer above, so it has no effect on the SHAP values computed
        # here. Kept for backward compatibility (callers may read the
        # attribute); consider wiring it into the explainer or removing it.
        self.X_background = shap.sample(X_train, min(100, len(X_train)))

    def _base_value(self) -> float:
        """Return the explainer's expected value as a plain float.

        Some shap versions expose ``expected_value`` as a 1-element array even
        for single-output regressors; normalize either form to a scalar so
        ``float()`` never fails or emits a deprecation warning.
        """
        return float(np.ravel(self.explainer.expected_value)[0])

    def get_feature_importance(self) -> dict:
        """Get global feature importance from the random forest model.

        Returns:
            Mapping of feature name -> impurity-based importance (floats that
            sum to ~1.0 over all features, per sklearn's convention).
        """
        importance = self.model.feature_importances_
        return {
            name: float(imp)
            for name, imp in zip(FEATURE_NAMES, importance)
        }

    def explain_prediction(self, X: np.ndarray) -> dict:
        """
        Explain a single prediction using SHAP values.

        Args:
            X: Feature vector (1, n_features)

        Returns:
            dict with per-feature SHAP values, the explainer base value, and
            the model's point prediction.
        """
        shap_values = self.explainer.shap_values(X)
        return {
            "shap_values": {
                name: float(val)
                for name, val in zip(FEATURE_NAMES, shap_values[0])
            },
            "base_value": self._base_value(),
            "prediction": float(self.model.predict(X)[0]),
        }

    def get_prediction_interval(self, X: np.ndarray, confidence=0.95) -> tuple:
        """
        Estimate prediction interval using individual tree predictions.

        The interval is empirical: the (alpha/2, 1 - alpha/2) percentiles of
        the per-tree predictions, not a parametric (e.g. normal) interval.

        Args:
            X: Feature vector (1, n_features)
            confidence: Confidence level (default 0.95)

        Returns:
            (lower_bound, upper_bound, std_dev)
        """
        # One prediction per tree in the ensemble.
        tree_predictions = np.array([
            tree.predict(X)[0]
            for tree in self.model.estimators_
        ])

        std_pred = tree_predictions.std()

        # Percentile bounds for the requested two-sided confidence level.
        alpha = 1 - confidence
        lower = np.percentile(tree_predictions, alpha / 2 * 100)
        upper = np.percentile(tree_predictions, (1 - alpha / 2) * 100)

        return float(lower), float(upper), float(std_pred)

    def plot_shap_waterfall(self, X: np.ndarray, feature_values: dict) -> plt.Figure:
        """
        Create a SHAP waterfall plot showing feature contributions.

        Args:
            X: Feature vector (1, n_features)
            feature_values: Dict mapping feature names to display values; used
                for the y-axis labels. Features missing from the dict fall
                back to the raw value in ``X``.

        Returns:
            matplotlib Figure
        """
        shap_values = self.explainer.shap_values(X)
        base_value = self._base_value()

        fig, ax = plt.subplots(figsize=(10, 6))

        # Order features by absolute contribution, largest first.
        indices = np.argsort(np.abs(shap_values[0]))[::-1]

        y_pos = np.arange(len(FEATURE_NAMES))
        # Red for positive (pushes prediction up), blue for negative.
        colors = ['#ff0051' if val > 0 else '#008bfb' for val in shap_values[0][indices]]

        ax.barh(y_pos, shap_values[0][indices], color=colors)
        ax.set_yticks(y_pos)
        # Prefer caller-supplied display values; fall back to the raw feature
        # vector (the original implementation ignored feature_values entirely).
        ax.set_yticklabels([
            f"{FEATURE_NAMES[i]} = {feature_values.get(FEATURE_NAMES[i], X[0][i]):.2f}"
            for i in indices
        ])
        ax.set_xlabel('SHAP value (impact on prediction)')
        ax.set_title(f'Feature Contributions to Prediction\nBase value: {base_value:.3f}')
        ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)

        plt.tight_layout()
        return fig