File size: 2,860 Bytes
c533407
93f8de3
 
 
c533407
 
cadabca
c533407
 
 
 
 
 
 
 
 
 
 
 
 
 
93f8de3
c533407
 
 
93f8de3
 
 
 
 
 
c533407
93f8de3
c533407
 
93f8de3
 
c533407
93f8de3
 
 
 
 
 
 
c533407
cadabca
c533407
51923bf
3d3fa38
51923bf
3d9e921
 
 
 
 
 
 
3d3fa38
51923bf
cadabca
 
 
 
51923bf
3d9e921
3d3fa38
3d9e921
 
3d3fa38
3d9e921
 
3d3fa38
3d9e921
 
3d3fa38
 
 
 
51923bf
cadabca
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import IsolationForest
import shap
import matplotlib.pyplot as plt
import gradio as gr

# Build the synthetic dataset: 500 clustered samples plus 50 uniform outliers.
# The global NumPy seed only drives the outlier draw; make_classification is
# seeded separately through its own random_state.
np.random.seed(42)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=42,
)
# Outliers span [-6, 6) in every feature; width is taken from X so it always matches.
outliers = np.random.uniform(-6, 6, (50, X.shape[1]))
X = np.concatenate((X, outliers), axis=0)

# Wrap the matrix in a DataFrame with columns Feature1..FeatureN.
columns = [f"Feature{n}" for n in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=columns)

# Train the Isolation Forest on the feature columns (fit returns the estimator).
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.1,
    random_state=42,
).fit(df)

# Score every row before the result columns are attached to df.
anomaly_scores = iso_forest.decision_function(df)  # negative score => anomaly
anomaly_labels = iso_forest.predict(df)  # 1 => normal, -1 => anomaly

# Attach the raw score and a readable label to each row.
df["Anomaly_Score"] = anomaly_scores
df["Anomaly_Label"] = np.where(anomaly_labels == 1, "Normal", "Anomaly")

# SHAP explanations, computed on the feature columns only (not the result columns).
explainer = shap.Explainer(iso_forest, df[columns])
shap_values = explainer(df[columns])

# Define functions for Gradio

def get_anomaly_samples():
    """Return three 10-row samples of the scored data, most anomalous first.

    Reads the module-level ``df`` (which must carry the ``Anomaly_Score`` and
    ``Anomaly_Label`` columns) and ranks it by ``Anomaly_Score`` in ascending
    order. Isolation Forest decision scores are lower for more abnormal
    points, so ascending order places the strongest anomalies at the top.

    Returns:
        A tuple ``(top_10, middle_10, bottom_10)`` of DataFrames:
        - ``top_10``: the 10 lowest-scoring rows labelled "Anomaly"
          (the most anomalous records);
        - ``middle_10``: the 10 rows around the median of the ranking;
        - ``bottom_10``: the 10 highest-scoring rows labelled "Normal"
          (the most clearly normal records).
        Each frame may hold fewer than 10 rows when not enough rows qualify.
    """
    # Ascending: most negative (most anomalous) score first, most normal last.
    ranked = df.sort_values("Anomaly_Score", ascending=True)

    # Strongest anomalies: head of the ranking restricted to flagged rows.
    top_10 = ranked[ranked["Anomaly_Label"] == "Anomaly"].head(10)

    # Window centred on the median; clamp at 0 so a frame shorter than
    # 10 rows does not produce a negative (wrap-around) slice start.
    mid_start = max(len(ranked) // 2 - 5, 0)
    middle_10 = ranked.iloc[mid_start:mid_start + 10]

    # Most clearly normal rows: tail of the ranking restricted to normal rows.
    bottom_10 = ranked[ranked["Anomaly_Label"] == "Normal"].tail(10)

    return top_10, middle_10, bottom_10

# Create Gradio interface: one tab with three tables filled on button click.
# NOTE: gr.Markdown renders inline HTML on its own and has no
# `unsafe_allow_html` parameter (that keyword belongs to Streamlit's
# st.markdown); passing it raises TypeError on current Gradio releases,
# so the headings are passed without it.
with gr.Blocks() as demo:
    gr.Markdown("# Isolation Forest Anomaly Detection")

    with gr.Tab("Anomaly Samples"):
        gr.Markdown("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Top 10 Records (Anomalies)</h3>")
        top_table = gr.Dataframe(label="Top 10 Records")

        gr.Markdown("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Middle 10 Records (Mixed)</h3>")
        middle_table = gr.Dataframe(label="Middle 10 Records")

        gr.Markdown("<h3 style='text-align: center; font-size: 18px; font-weight: bold;'>Bottom 10 Records (Normal)</h3>")
        bottom_table = gr.Dataframe(label="Bottom 10 Records")

        # The callback takes no inputs and returns three DataFrames, mapped
        # in order onto the three tables above.
        anomaly_samples_button = gr.Button("Show Anomaly Samples")
        anomaly_samples_button.click(
            get_anomaly_samples,
            outputs=[top_table, middle_table, bottom_table],
        )

# Launch the Gradio app
demo.launch()