"""Gradio demo for anomaly detection.

At import time the script fits an Isolation Forest on a synthetic 20-feature
dataset (with SHAP explanations). It then exposes a Gradio tab that compares
several scikit-learn outlier detectors on 2-D toy datasets.
"""
import time
from functools import partial

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from matplotlib.figure import Figure
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs, make_classification, make_moons
from sklearn.ensemble import IsolationForest
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDOneClassSVM
from sklearn.metrics import auc, roc_curve
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline

# Layout of the synthetic dataset: inlier rows first, injected outliers last.
N_INLIERS = 500
N_OUTLIERS = 50
N_FEATURES = 20

# Generate synthetic data with 20 features
np.random.seed(42)
X, _ = make_classification(
    n_samples=N_INLIERS,
    n_features=N_FEATURES,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=42,
)
outliers = np.random.uniform(low=-6, high=6, size=(N_OUTLIERS, N_FEATURES))  # Add outliers
X = np.vstack([X, outliers])

# Convert to DataFrame
columns = [f"Feature{i+1}" for i in range(N_FEATURES)]
df = pd.DataFrame(X, columns=columns)

# Fit Isolation Forest
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.1,
    random_state=42,
)
iso_forest.fit(df)

# Predict anomaly scores
anomaly_scores = iso_forest.decision_function(df)  # Negative values indicate anomalies
anomaly_labels = iso_forest.predict(df)  # -1 for anomaly, 1 for normal

# Add results to DataFrame
df["Anomaly_Score"] = anomaly_scores
df["Anomaly_Label"] = np.where(anomaly_labels == -1, "Anomaly", "Normal")

# True labels (1 for anomaly, 0 for normal) for ROC curve. They follow the row
# layout built above (inliers first, injected outliers last) instead of being
# copied from the model's own predictions, which would make any evaluation
# against them trivially perfect.
true_labels = np.concatenate(
    [np.zeros(N_INLIERS, dtype=int), np.ones(N_OUTLIERS, dtype=int)]
)

# SHAP Explainability
explainer = shap.Explainer(iso_forest, df[columns])
shap_values = explainer(df[columns])


# Functions for Anomaly Detection Algorithms tab
def _make_detector(clf_name, outliers_fraction):
    """Build only the requested (unfitted) detector; raise KeyError if unknown."""
    factories = {
        "Robust covariance": lambda: EllipticEnvelope(
            contamination=outliers_fraction, random_state=42
        ),
        "One-Class SVM": lambda: svm.OneClassSVM(
            nu=outliers_fraction, kernel="rbf", gamma=0.1
        ),
        "One-Class SVM (SGD)": lambda: make_pipeline(
            Nystroem(gamma=0.1, random_state=42, n_components=150),
            SGDOneClassSVM(
                nu=outliers_fraction,
                shuffle=True,
                fit_intercept=True,
                random_state=42,
                tol=1e-6,
            ),
        ),
        "Isolation Forest": lambda: IsolationForest(
            contamination=outliers_fraction, random_state=42
        ),
        "Local Outlier Factor": lambda: LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction
        ),
    }
    return factories[clf_name]()


def _make_dataset(input_data, n_samples, n_inliers):
    """Build only the requested 2-D inlier dataset; raise KeyError if unknown."""
    blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
    # "Moons" and "Noise" are generated with the full n_samples rather than
    # n_inliers, so for them the realised outlier share is a little lower than
    # the requested fraction.
    factories = {
        "Central Blob": lambda: make_blobs(
            centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params
        )[0],
        "Two Blobs": lambda: make_blobs(
            centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params
        )[0],
        "Blob with Noise": lambda: make_blobs(
            centers=[[2, 2], [-2, -2]], cluster_std=[1.5, 0.3], **blobs_params
        )[0],
        "Moons": lambda: 4.0
        * (
            make_moons(n_samples=n_samples, noise=0.05, random_state=0)[0]
            - np.array([0.5, 0.25])
        ),
        "Noise": lambda: 14.0 * (np.random.RandomState(42).rand(n_samples, 2) - 0.5),
    }
    return factories[input_data]()


def train_models(input_data, outliers_fraction, n_samples, clf_name):
    """Train one anomaly detection model on one toy dataset and plot the result.

    Args:
        input_data: Dataset name: "Central Blob", "Two Blobs",
            "Blob with Noise", "Moons" or "Noise".
        outliers_fraction: Share of uniformly distributed outliers to inject;
            also passed to the detector as ``contamination`` / ``nu``.
        n_samples: Requested number of samples (cast to ``int``).
        clf_name: Detector name: "Robust covariance", "One-Class SVM",
            "One-Class SVM (SGD)", "Isolation Forest" or
            "Local Outlier Factor".

    Returns:
        A ``matplotlib.figure.Figure`` with the points coloured by predicted
        label and, for every detector except Local Outlier Factor, the
        decision boundary drawn in black.

    Raises:
        KeyError: If ``input_data`` or ``clf_name`` is not a known name.
    """
    n_samples = int(n_samples)  # UI sliders may deliver the value as a float
    n_outliers = int(outliers_fraction * n_samples)
    n_inliers = n_samples - n_outliers

    clf = _make_detector(clf_name, outliers_fraction)
    inliers = _make_dataset(input_data, n_samples, n_inliers)

    rng = np.random.RandomState(42)
    X = np.concatenate(
        [inliers, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0
    )

    is_lof = clf_name == "Local Outlier Factor"

    # Fit exactly once; the reported time covers that single fit.
    t0 = time.time()
    if is_lof:
        # LOF is scored through fit_predict here, so fitting and labelling
        # happen in the same call.
        y_pred = clf.fit_predict(X)
    else:
        clf.fit(X)
    t1 = time.time()

    # A Figure that is not registered with pyplot is released once the caller
    # drops it, so repeated callbacks do not accumulate open figures.
    fig = Figure(figsize=(10, 8))
    ax = fig.subplots()

    if not is_lof:
        y_pred = clf.predict(X)
        xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        # Predictions are -1/+1, so the 0 level traces the decision boundary.
        ax.contour(xx, yy, Z, levels=[0], linewidths=2, colors="black")

    colors = np.array(["#377eb8", "#ff7f00"])
    # Map labels -1/+1 to colour indices 0/1.
    ax.scatter(X[:, 0], X[:, 1], s=30, color=colors[(y_pred + 1) // 2])
    ax.set_xlim(-7, 7)
    ax.set_ylim(-7, 7)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(f"{clf_name} (time: {t1 - t0:.2f}s)")
    return fig


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Isolation Forest Anomaly Detection")

    with gr.Tab("Anomaly Detection Algorithms"):
        gr.Markdown("## Compare Anomaly Detection Algorithms")

        input_models = [
            "Robust covariance",
            "One-Class SVM",
            "One-Class SVM (SGD)",
            "Isolation Forest",
            "Local Outlier Factor",
        ]
        input_data = gr.Radio(
            choices=["Central Blob", "Two Blobs", "Blob with Noise", "Moons", "Noise"],
            value="Moons",
            label="Dataset Type",
        )
        n_samples = gr.Slider(
            minimum=100, maximum=500, step=25, value=300, label="Number of Samples"
        )
        # The fraction is forwarded as `contamination`, which scikit-learn
        # restricts to (0, 0.5], so the slider stops at 0.5.
        outliers_fraction = gr.Slider(
            minimum=0.1, maximum=0.5, step=0.1, value=0.2, label="Outlier Fraction"
        )

        controls = [input_data, outliers_fraction, n_samples]
        for clf_name in input_models:
            plot = gr.Plot(label=clf_name)
            # partial binds clf_name now, so each plot keeps its own detector.
            fn = partial(train_models, clf_name=clf_name)
            for control in (input_data, n_samples, outliers_fraction):
                control.change(fn=fn, inputs=controls, outputs=plot)
            # Render once on page load so the plots are not empty until the
            # first control change.
            demo.load(fn=fn, inputs=controls, outputs=plot)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()