File size: 6,542 Bytes
c533407
93f8de3
 
 
a9b33b5
c533407
 
cadabca
c533407
 
 
 
 
 
 
 
 
 
 
 
 
 
93f8de3
c533407
 
 
93f8de3
 
 
 
 
 
c533407
93f8de3
c533407
 
93f8de3
 
c533407
93f8de3
 
 
 
a9b33b5
 
 
93f8de3
 
 
c533407
cadabca
c533407
7918a66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9b33b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51923bf
3d3fa38
51923bf
8a47560
3d9e921
 
8a47560
 
 
 
 
 
 
 
3d9e921
 
8a47560
3d3fa38
51923bf
cadabca
 
 
 
7918a66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51923bf
a9b33b5
3d3fa38
3d9e921
a9b33b5
3d3fa38
3d9e921
a9b33b5
3d3fa38
3d9e921
 
3d3fa38
 
 
 
a9b33b5
 
 
 
 
 
51923bf
cadabca
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
import shap
import matplotlib.pyplot as plt
import gradio as gr

# Generate synthetic data with 20 features.
# The rows produced by make_classification act as the "normal" population;
# the class labels it returns are irrelevant here and are discarded.
np.random.seed(42)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=42
)
# Inject uniformly distributed points as the known outliers.
outliers = np.random.uniform(low=-6, high=6, size=(50, 20))
X = np.vstack([X, outliers])

# Ground truth for evaluation (1 = injected outlier, 0 = generated inlier).
# It is built from how the data was constructed, NOT from the model's own
# predictions: labels derived from thresholding the anomaly score would be
# perfectly separable by that same score and always yield an AUC of 1.0.
true_labels = np.concatenate([
    np.zeros(len(X) - len(outliers), dtype=int),
    np.ones(len(outliers), dtype=int),
])

# Convert to DataFrame
columns = [f"Feature{i+1}" for i in range(20)]
df = pd.DataFrame(X, columns=columns)

# Fit Isolation Forest
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.1,
    random_state=42
)
iso_forest.fit(df)

# Predict anomaly scores
anomaly_scores = iso_forest.decision_function(df)  # Negative values indicate anomalies
anomaly_labels = iso_forest.predict(df)  # -1 for anomaly, 1 for normal

# Add results to DataFrame. From here on df holds the 20 feature columns plus
# these two result columns, so feature-only consumers must select df[columns].
df["Anomaly_Score"] = anomaly_scores
df["Anomaly_Label"] = np.where(anomaly_labels == -1, "Anomaly", "Normal")

# SHAP Explainability: explain the fitted forest on the feature columns only.
explainer = shap.Explainer(iso_forest, df[columns])
shap_values = explainer(df[columns])

# Define functions for Gradio

def get_shap_summary():
    """Render the global SHAP summary plot and return the saved image path.

    The plot is drawn from the module-level ``shap_values`` and the feature
    columns of ``df``, then written to ``shap_summary.png`` in the current
    working directory (overwriting any previous file).

    Returns:
        The file name ``"shap_summary.png"``.
    """
    plt.figure()
    try:
        # show=False keeps SHAP from calling plt.show(), so the figure is
        # still available to be written to disk afterwards.
        shap.summary_plot(shap_values, df[columns], feature_names=columns, show=False)
        # A tight bounding box fits the saved image to everything drawn.
        plt.savefig("shap_summary.png", bbox_inches="tight")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each button click would leak one figure.
        plt.close()
    return "shap_summary.png"

def get_shap_waterfall(index):
    """Render the SHAP waterfall plot for one record and return the image path.

    Args:
        index: Positional row index of the record to explain. Anything that
            ``int()`` accepts is allowed; negative values count from the end.

    Returns:
        The file name ``"shap_waterfall.png"`` (written to the current
        working directory, overwriting any previous file).

    Raises:
        gr.Error: If ``index`` is missing, not convertible to an integer, or
            outside the range of available rows.
    """
    try:
        specific_index = int(index)
    except (TypeError, ValueError, OverflowError) as err:
        # Covers an emptied input box (None), NaN and infinity.
        raise gr.Error("Data Point Index must be a whole number.") from err

    n_rows = len(df)
    if not -n_rows <= specific_index < n_rows:
        raise gr.Error(f"Data Point Index must be between 0 and {n_rows - 1}.")

    plt.figure()
    try:
        shap.waterfall_plot(
            shap.Explanation(
                values=shap_values.values[specific_index],
                base_values=shap_values.base_values[specific_index],
                # Only the feature columns: df also carries Anomaly_Score and
                # Anomaly_Label, which have no matching SHAP value.
                data=df[columns].iloc[specific_index],
                feature_names=columns
            ),
            # Do not let SHAP call plt.show() before the figure is saved.
            show=False
        )
        plt.savefig("shap_waterfall.png", bbox_inches="tight")
    finally:
        # Release the figure so repeated clicks do not accumulate figures.
        plt.close()
    return "shap_waterfall.png"

def get_scatter_plot(feature1, feature2):
    """Scatter two columns of ``df`` against each other, coloured by label.

    Points are coloured by whether their ``Anomaly_Label`` equals
    ``"Anomaly"``. The image is written to ``scatter_plot.png`` in the
    current working directory (overwriting any previous file).

    Args:
        feature1: Name of the column plotted on the x axis.
        feature2: Name of the column plotted on the y axis.

    Returns:
        The file name ``"scatter_plot.png"``.

    Raises:
        gr.Error: If either argument is not a column of ``df`` (for example
            ``None`` when nothing was selected in the dropdown).
    """
    if feature1 not in df.columns or feature2 not in df.columns:
        raise gr.Error("Select both Feature 1 and Feature 2 before generating the plot.")

    plt.figure(figsize=(8, 6))
    try:
        plt.scatter(
            df[feature1],
            df[feature2],
            # Boolean mask mapped through the colormap: anomalies vs. normals.
            c=(df["Anomaly_Label"] == "Anomaly"),
            cmap="coolwarm",
            edgecolor="k",
            alpha=0.7
        )
        plt.title(f"Isolation Forest - {feature1} vs {feature2}")
        plt.xlabel(feature1)
        plt.ylabel(feature2)
        plt.savefig("scatter_plot.png")
    finally:
        # Release the figure so repeated clicks do not accumulate figures.
        plt.close()
    return "scatter_plot.png"

def get_roc_curve():
    """Plot the ROC curve of the anomaly scores and return the image path.

    The curve compares the module-level ``true_labels`` (1 = anomaly) with
    the negated ``Anomaly_Score`` column of ``df``. The image is written to
    ``roc_curve.png`` in the current working directory (overwriting any
    previous file).

    Returns:
        The file name ``"roc_curve.png"``.
    """
    # roc_curve expects larger scores for the positive class; the stored
    # score is lower for anomalies, so it is negated.
    fpr, tpr, _ = roc_curve(true_labels, -df["Anomaly_Score"])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    try:
        plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
        plt.plot([0, 1], [0, 1], "k--", label="Random Guess")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Receiver Operating Characteristic (ROC) Curve")
        plt.legend(loc="lower right")
        plt.grid()
        plt.savefig("roc_curve.png")
    finally:
        # Release the figure so repeated clicks do not accumulate figures.
        plt.close()
    return "roc_curve.png"

def get_anomaly_samples():
    """Return three samples of ``df`` ranked by ``Anomaly_Score``.

    All three frames are ordered by ascending score, i.e. from most
    anomalous to most normal (lower scores are more anomalous).

    Returns:
        A tuple ``(top_10, middle_10, bottom_10)`` of DataFrames:

        * ``top_10``: up to 10 rows labelled ``"Anomaly"`` with the lowest
          scores (the most anomalous records).
        * ``middle_10``: up to 5 ``"Anomaly"`` rows with the highest scores
          followed by up to 5 ``"Normal"`` rows with the lowest scores,
          i.e. the records on either side of the decision boundary.
        * ``bottom_10``: up to 10 rows labelled ``"Normal"`` with the
          highest scores (the most normal records).

        A frame is shorter (possibly empty) when fewer matching rows exist.
    """
    # Ascending order puts the lowest, most anomalous, scores first.
    ranked = df.sort_values("Anomaly_Score", ascending=True)
    anomalies = ranked[ranked["Anomaly_Label"] == "Anomaly"]
    normals = ranked[ranked["Anomaly_Label"] == "Normal"]

    # Top 10 anomalies: the most extreme (lowest-scoring) anomalous records.
    top_10 = anomalies.head(10)

    # Middle 10 (mix of anomalies and normal): anomalies sit at one end of
    # the score ranking, so a slice around the median row contains no
    # anomalies at all. The mixed region is the decision boundary, so take
    # the records straddling it. head/tail never fail on short groups,
    # unlike sample(n=5).
    middle_10 = pd.concat([anomalies.tail(5), normals.head(5)])

    # Bottom 10 normal records: the highest-scoring normal records.
    bottom_10 = normals.tail(10)

    return top_10, middle_10, bottom_10

# Create Gradio interface: one tab per view, each with a button that calls
# the matching plotting/sampling function and shows its result.
with gr.Blocks() as demo:
    gr.Markdown("# Isolation Forest Anomaly Detection")
    
    with gr.Tab("SHAP Summary"):
        gr.Markdown("### Global Explainability: SHAP Summary Plot")
        shap_button = gr.Button("Generate SHAP Summary Plot")
        shap_image = gr.Image()
        shap_button.click(get_shap_summary, outputs=shap_image)
    
    with gr.Tab("SHAP Waterfall"):
        gr.Markdown("### Local Explainability: SHAP Waterfall Plot")
        # precision=0 makes the component deliver an integer row index.
        index_input = gr.Number(label="Data Point Index", value=0, precision=0)
        shap_waterfall_button = gr.Button("Generate SHAP Waterfall Plot")
        shap_waterfall_image = gr.Image()
        shap_waterfall_button.click(get_shap_waterfall, inputs=index_input, outputs=shap_waterfall_image)
    
    with gr.Tab("Feature Scatter Plot"):
        gr.Markdown("### Feature Interaction: Scatter Plot")
        # Pre-select two different features so the button works without any
        # prior selection (an empty dropdown would pass None to the handler).
        feature1_dropdown = gr.Dropdown(choices=columns, value=columns[0], label="Feature 1")
        feature2_dropdown = gr.Dropdown(choices=columns, value=columns[1], label="Feature 2")
        scatter_button = gr.Button("Generate Scatter Plot")
        scatter_image = gr.Image()
        scatter_button.click(get_scatter_plot, inputs=[feature1_dropdown, feature2_dropdown], outputs=scatter_image)

    with gr.Tab("Anomaly Samples"):
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Top 10 Records (Anomalies)</h3>")
        top_table = gr.Dataframe(label="Top 10 Records")
        
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Middle 10 Records (Mixed)</h3>")
        middle_table = gr.Dataframe(label="Middle 10 Records")
        
        gr.HTML("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Bottom 10 Records (Normal)</h3>")
        bottom_table = gr.Dataframe(label="Bottom 10 Records")
        
        # A single click fills all three tables from the returned tuple.
        anomaly_samples_button = gr.Button("Show Anomaly Samples")
        anomaly_samples_button.click(
            get_anomaly_samples, 
            outputs=[top_table, middle_table, bottom_table]
        )
    
    with gr.Tab("ROC Curve"):
        gr.Markdown("### ROC Curve for Isolation Forest")
        roc_button = gr.Button("Generate ROC Curve")
        roc_image = gr.Image()
        roc_button.click(get_roc_curve, outputs=roc_image)

# Launch the Gradio app
demo.launch()