Spaces:
Build error
Build error
File size: 6,542 Bytes
c533407 93f8de3 a9b33b5 c533407 cadabca c533407 93f8de3 c533407 93f8de3 c533407 93f8de3 c533407 93f8de3 c533407 93f8de3 a9b33b5 93f8de3 c533407 cadabca c533407 7918a66 a9b33b5 51923bf 3d3fa38 51923bf 8a47560 3d9e921 8a47560 3d9e921 8a47560 3d3fa38 51923bf cadabca 7918a66 51923bf a9b33b5 3d3fa38 3d9e921 a9b33b5 3d3fa38 3d9e921 a9b33b5 3d3fa38 3d9e921 3d3fa38 a9b33b5 51923bf cadabca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
import shap
import matplotlib.pyplot as plt
import gradio as gr
# ---------------------------------------------------------------------------
# Synthetic data set
# ---------------------------------------------------------------------------
# N_INLIERS rows come from make_classification; N_OUTLIERS rows of uniform
# noise are stacked underneath them and act as the injected outliers.
N_INLIERS = 500
N_OUTLIERS = 50
N_FEATURES = 20

np.random.seed(42)  # seeds the np.random.uniform call below
X, _ = make_classification(
    n_samples=N_INLIERS,
    n_features=N_FEATURES,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=42,
)
outliers = np.random.uniform(low=-6, high=6, size=(N_OUTLIERS, N_FEATURES))
X = np.vstack([X, outliers])

# Convert to DataFrame
columns = [f"Feature{i+1}" for i in range(N_FEATURES)]
df = pd.DataFrame(X, columns=columns)

# ---------------------------------------------------------------------------
# Isolation Forest
# ---------------------------------------------------------------------------
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.1,
    random_state=42,
)
# Fit and score BEFORE the result columns are appended, so the model only
# ever sees the feature columns.
iso_forest.fit(df)

anomaly_scores = iso_forest.decision_function(df)  # negative values indicate anomalies
anomaly_labels = iso_forest.predict(df)  # -1 for anomaly, 1 for normal

# Add results to DataFrame
df["Anomaly_Score"] = anomaly_scores
df["Anomaly_Label"] = np.where(anomaly_labels == -1, "Anomaly", "Normal")

# Ground-truth labels for the ROC curve (1 = injected outlier, 0 = generated
# sample). They follow the row order built above: the first N_INLIERS rows are
# the make_classification samples, the last N_OUTLIERS rows are the uniform
# noise. Deriving them from the model's own predictions instead would compare
# the scores against a thresholded copy of themselves and always give AUC 1.0.
true_labels = np.concatenate([
    np.zeros(N_INLIERS, dtype=int),
    np.ones(N_OUTLIERS, dtype=int),
])

# ---------------------------------------------------------------------------
# SHAP explainability (computed once, reused by the plotting callbacks)
# ---------------------------------------------------------------------------
explainer = shap.Explainer(iso_forest, df[columns])
shap_values = explainer(df[columns])
# Define functions for Gradio
def get_shap_summary():
    """Render the global SHAP summary plot and return the image path.

    Draws the summary plot for the module-level ``shap_values`` over the
    feature columns of ``df`` and writes it to ``shap_summary.png`` in the
    current working directory.

    Returns:
        The string ``"shap_summary.png"``.
    """
    fig = plt.figure()
    try:
        # show=False keeps the plot on the current figure so it can be saved.
        shap.summary_plot(shap_values, df[columns], feature_names=columns, show=False)
        plt.savefig("shap_summary.png")
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # long-running app accumulates one figure per button click.
        plt.close(fig)
    return "shap_summary.png"
def get_shap_waterfall(index):
    """Render the SHAP waterfall plot for one row and return the image path.

    Args:
        index: Row position of the data point to explain. It is converted
            with ``int()``; negative values index from the end, as with
            normal Python indexing.

    Returns:
        The string ``"shap_waterfall.png"``.

    Raises:
        ValueError: If ``index`` is ``None``.
        IndexError: If the index falls outside the rows of ``df``.
    """
    if index is None:
        raise ValueError("A data point index is required.")
    specific_index = int(index)

    n_rows = len(df)
    if not -n_rows <= specific_index < n_rows:
        raise IndexError(
            f"Data point index {specific_index} is out of range for {n_rows} rows."
        )

    explanation = shap.Explanation(
        values=shap_values.values[specific_index],
        base_values=shap_values.base_values[specific_index],
        # Only the feature columns: df also carries Anomaly_Score and
        # Anomaly_Label, which have no SHAP value and would not line up with
        # feature_names.
        data=df[columns].iloc[specific_index].to_numpy(),
        feature_names=columns,
    )

    fig = plt.figure()
    try:
        # show=False stops SHAP from displaying the figure before it is saved.
        shap.waterfall_plot(explanation, show=False)
        plt.savefig("shap_waterfall.png")
    finally:
        # Release the figure so repeated clicks do not pile up open figures.
        plt.close(fig)
    return "shap_waterfall.png"
def get_scatter_plot(feature1, feature2):
    """Scatter two columns of ``df`` against each other, coloured by label.

    Args:
        feature1: Name of the column plotted on the x axis.
        feature2: Name of the column plotted on the y axis.

    Returns:
        The string ``"scatter_plot.png"``.

    Raises:
        ValueError: If either name is missing (e.g. nothing selected in the
            dropdown) or is not a column of ``df``.
    """
    for axis_label, name in (("Feature 1", feature1), ("Feature 2", feature2)):
        if name is None or name not in df.columns:
            raise ValueError(f"{axis_label} must be one of the available columns.")

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.scatter(
            df[feature1],
            df[feature2],
            # Boolean mask: rows labelled "Anomaly" get the other end of the
            # colour map.
            c=(df["Anomaly_Label"] == "Anomaly"),
            cmap="coolwarm",
            edgecolor="k",
            alpha=0.7,
        )
        ax.set_title(f"Isolation Forest - {feature1} vs {feature2}")
        ax.set_xlabel(feature1)
        ax.set_ylabel(feature2)
        fig.savefig("scatter_plot.png")
    finally:
        # Release the figure so repeated clicks do not pile up open figures.
        plt.close(fig)
    return "scatter_plot.png"
def get_roc_curve():
    """Plot the ROC curve of the anomaly scores and return the image path.

    Uses the module-level ``true_labels`` as the reference labels and the
    negated ``Anomaly_Score`` column of ``df`` as the ranking score.

    Returns:
        The string ``"roc_curve.png"``.
    """
    # Negate the scores: higher Anomaly_Score means "more normal", while
    # roc_curve expects higher values for the positive (anomaly) class.
    fpr, tpr, _ = roc_curve(true_labels, -df["Anomaly_Score"])
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
        ax.plot([0, 1], [0, 1], "k--", label="Random Guess")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title("Receiver Operating Characteristic (ROC) Curve")
        ax.legend(loc="lower right")
        ax.grid()
        fig.savefig("roc_curve.png")
    finally:
        # Release the figure so repeated clicks do not pile up open figures.
        plt.close(fig)
    return "roc_curve.png"
def _sample_rows(frame, n, random_state=42):
    """Return ``n`` randomly sampled rows of ``frame`` (empty frame if n <= 0)."""
    if n <= 0:
        return frame.iloc[0:0]
    return frame.sample(n=n, random_state=random_state)


def get_anomaly_samples():
    """Return three 10-row views of ``df`` ranked by anomaly score.

    Rows are ranked from most anomalous (lowest ``Anomaly_Score``) to most
    normal (highest ``Anomaly_Score``).

    Returns:
        A tuple ``(top_10, middle_10, bottom_10)`` of DataFrames:

        * ``top_10`` - the 10 most anomalous rows labelled "Anomaly".
        * ``middle_10`` - up to 10 rows sampled from the 100-row window around
          the middle of the ranking: 5 "Anomaly" and 5 "Normal" rows when the
          window holds that many of each, otherwise topped up from whichever
          label is available.
        * ``bottom_10`` - the 10 most normal rows labelled "Normal".

        Each frame has fewer rows when ``df`` does not contain enough.
    """
    # Lower score = more anomalous, so ascending order puts the strongest
    # anomalies first.
    ranked = df.sort_values("Anomaly_Score", ascending=True)

    # Top 10 anomalies (most anomalous first)
    top_10 = ranked[ranked["Anomaly_Label"] == "Anomaly"].head(10)

    # Middle window; clamp the start so short frames do not get a negative
    # (end-relative) slice start.
    mid_start = max(0, len(ranked) // 2 - 50)
    middle_section = ranked.iloc[mid_start: mid_start + 100]
    mid_anomalies = middle_section[middle_section["Anomaly_Label"] == "Anomaly"]
    mid_normals = middle_section[middle_section["Anomaly_Label"] == "Normal"]

    # Aim for a 5/5 split, but never ask for more rows than a group holds:
    # flagged rows cluster at one end of the ranking, so the middle window can
    # contain no anomalies at all, and sampling 5 from an empty frame raises.
    n_anomalies = min(5, len(mid_anomalies))
    n_normals = min(10 - n_anomalies, len(mid_normals))
    n_anomalies = min(10 - n_normals, len(mid_anomalies))

    middle_10 = pd.concat([
        _sample_rows(mid_anomalies, n_anomalies),
        _sample_rows(mid_normals, n_normals),
    ]).sort_values("Anomaly_Score", ascending=True)

    # Bottom 10 normal records (most normal last)
    bottom_10 = ranked[ranked["Anomaly_Label"] == "Normal"].tail(10)

    return top_10, middle_10, bottom_10
# Create Gradio interface
# One tab per view; every button is wired to the plotting/table callback
# defined above and writes into the component(s) of its own tab.
with gr.Blocks() as demo:
    gr.Markdown("# Isolation Forest Anomaly Detection")

    with gr.Tab("SHAP Summary"):
        gr.Markdown("### Global Explainability: SHAP Summary Plot")
        shap_button = gr.Button("Generate SHAP Summary Plot")
        shap_image = gr.Image()
        shap_button.click(get_shap_summary, outputs=shap_image)

    with gr.Tab("SHAP Waterfall"):
        gr.Markdown("### Local Explainability: SHAP Waterfall Plot")
        # precision=0 makes the component hand the callback an integer
        # instead of a float.
        index_input = gr.Number(label="Data Point Index", value=0, precision=0)
        shap_waterfall_button = gr.Button("Generate SHAP Waterfall Plot")
        shap_waterfall_image = gr.Image()
        shap_waterfall_button.click(
            get_shap_waterfall,
            inputs=index_input,
            outputs=shap_waterfall_image,
        )

    with gr.Tab("Feature Scatter Plot"):
        gr.Markdown("### Feature Interaction: Scatter Plot")
        # Pre-select two features so clicking the button straight away does
        # not pass None to the callback.
        feature1_dropdown = gr.Dropdown(choices=columns, value=columns[0], label="Feature 1")
        feature2_dropdown = gr.Dropdown(choices=columns, value=columns[1], label="Feature 2")
        scatter_button = gr.Button("Generate Scatter Plot")
        scatter_image = gr.Image()
        scatter_button.click(
            get_scatter_plot,
            inputs=[feature1_dropdown, feature2_dropdown],
            outputs=scatter_image,
        )

    with gr.Tab("Anomaly Samples"):
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Top 10 Records (Anomalies)</h3>")
        top_table = gr.Dataframe(label="Top 10 Records")
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Middle 10 Records (Mixed)</h3>")
        middle_table = gr.Dataframe(label="Middle 10 Records")
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Bottom 10 Records (Normal)</h3>")
        bottom_table = gr.Dataframe(label="Bottom 10 Records")
        anomaly_samples_button = gr.Button("Show Anomaly Samples")
        anomaly_samples_button.click(
            get_anomaly_samples,
            outputs=[top_table, middle_table, bottom_table],
        )

    with gr.Tab("ROC Curve"):
        gr.Markdown("### ROC Curve for Isolation Forest")
        roc_button = gr.Button("Generate ROC Curve")
        roc_image = gr.Image()
        roc_button.click(get_roc_curve, outputs=roc_image)

# Launch the Gradio app
demo.launch()
|