"""Isolation Forest anomaly detection on synthetic data, explained with SHAP.

The script:

1. builds a 20-feature synthetic dataset and appends uniformly scattered
   rows that act as outliers,
2. fits an Isolation Forest and labels every row as "Anomaly" or "Normal",
3. draws a SHAP summary plot (global view) and a SHAP waterfall plot for the
   first detected anomaly (local view),
4. draws pairwise scatter plots of the first five features, coloured by the
   anomaly label.
"""

from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import IsolationForest

# Generate synthetic data with 20 features.
# The global NumPy seed makes the outlier rows drawn below reproducible.
np.random.seed(42)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=42,
)

# Add outliers: 50 rows drawn uniformly from [-6, 6) in every feature.
outliers = np.random.uniform(low=-6, high=6, size=(50, 20))
X = np.vstack([X, outliers])

# Convert to DataFrame.
columns = [f"Feature{i+1}" for i in range(20)]
df = pd.DataFrame(X, columns=columns)

# Fit Isolation Forest on the feature columns only.
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.1,
    random_state=42,
)
iso_forest.fit(df[columns])

# Predict anomaly scores and labels.
anomaly_scores = iso_forest.decision_function(df[columns])  # Negative values indicate anomalies
anomaly_labels = iso_forest.predict(df[columns])  # -1 for anomaly, 1 for normal
is_anomaly = anomaly_labels == -1

# Add results to DataFrame. From here on `df` has two extra non-feature
# columns, so every model/SHAP call below selects `df[columns]` explicitly.
df["Anomaly_Score"] = anomaly_scores
df["Anomaly_Label"] = np.where(is_anomaly, "Anomaly", "Normal")

# SHAP explainability.
explainer = shap.Explainer(iso_forest, df[columns])
shap_values = explainer(df[columns])

# SHAP summary plot (global explainability).
shap.summary_plot(shap_values, df[columns], feature_names=columns)

# SHAP waterfall plot for a specific data point (local explainability).
# Use the row *position* of the first anomaly: it indexes NumPy arrays and
# `.iloc`, so it must not depend on the DataFrame's index labels.
specific_index = int(np.flatnonzero(is_anomaly)[0])
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values.values[specific_index],
        base_values=shap_values.base_values[specific_index],
        # Restrict to the feature columns so `data` lines up one-to-one with
        # `values` and `feature_names`; the full row also carries the
        # Anomaly_Score and Anomaly_Label columns added above.
        data=df[columns].iloc[specific_index].to_numpy(),
        feature_names=columns,
    )
)

# Scatter plots for pairwise combinations of features.
feature_combinations = list(combinations(columns[:5], 2))  # Use first 5 features for simplicity
anomaly_colors = is_anomaly.astype(int)  # 1 = anomaly, 0 = normal; mapped through the colormap
for feature1, feature2 in feature_combinations:
    plt.figure(figsize=(8, 6))
    plt.scatter(
        df[feature1],
        df[feature2],
        c=anomaly_colors,
        cmap="coolwarm",
        edgecolor="k",
        alpha=0.7,
    )
    plt.title(f"Isolation Forest - {feature1} vs {feature2}")
    plt.xlabel(feature1)
    plt.ylabel(feature2)
    plt.show()