Spaces:
Running
Running
uploading application files
Browse files- app.py +105 -0
- dataset.csv +73 -0
- isolation_forest_model.pkl +3 -0
- kmeans_model.pkl +3 -0
- model.ipynb +0 -0
- requirements.txt +5 -0
app.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import joblib
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.preprocessing import StandardScaler
|
| 7 |
+
from sklearn.cluster import KMeans
|
| 8 |
+
from sklearn.ensemble import IsolationForest
|
| 9 |
+
from sklearn.metrics import silhouette_score
|
| 10 |
+
|
| 11 |
+
# Load the pre-trained estimators that are bundled next to this script.
# NOTE(review): joblib.load unpickles arbitrary objects; this is only safe for
# files shipped with the app itself, never for user-supplied uploads.
# NOTE(review): `kmeans` is not referenced again in this script (a fresh
# K-Means model is fitted further down) -- confirm whether it is still needed.
kmeans, isolation_forest = (
    joblib.load(model_path)
    for model_path in ("kmeans_model.pkl", "isolation_forest_model.pkl")
)
|
| 14 |
+
|
| 15 |
+
# Streamlit page chrome: browser-tab metadata first, then the visible heading
# and a one-line description of what the app does.
_APP_NAME = "Anomaly & Clustering Analysis"

st.set_page_config(page_title=_APP_NAME, page_icon="📊", layout="wide")
st.title(f"📌 {_APP_NAME}")
st.markdown(
    "### A simple interactive app to analyze anomalies and clusters in "
    "generated data using Isolation Forest and K-Means clustering."
)
|
| 19 |
+
|
| 20 |
+
# Load dataset
df = pd.read_csv("dataset.csv")

# Ensure data is numerical.
# The coercion has to happen *before* missing rows are dropped:
# errors="coerce" turns every unparseable cell into NaN, and those NaNs must
# not reach the scaling / clustering steps that follow.
df["Accidental Deaths"] = pd.to_numeric(df["Accidental Deaths"], errors="coerce")
df.dropna(inplace=True)
|
| 26 |
+
|
| 27 |
+
# Standardize the target column. The fitted scaler object is kept around so
# that values entered by the user can be transformed the same way later.
_deaths_column = df[["Accidental Deaths"]]
scaler = StandardScaler().fit(_deaths_column)
scaled_data = scaler.transform(_deaths_column)
|
| 30 |
+
|
| 31 |
+
# Determine optimal K by maximising the silhouette score.
# silhouette_score requires 2 <= n_clusters <= n_samples - 1, so the candidate
# range (normally 2..9) is capped by the number of usable rows instead of
# crashing on a small dataset.
max_k = min(9, len(scaled_data) - 1)
if max_k < 2:
    st.error("Not enough valid rows in dataset.csv to run clustering (at least 3 are required).")
    st.stop()

k_values = range(2, max_k + 1)
silhouette_scores = [
    silhouette_score(
        scaled_data,
        # Fixed random_state keeps the chosen K stable across script reruns.
        KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(scaled_data),
    )
    for k in k_values
]
optimal_k = k_values[int(np.argmax(silhouette_scores))]

# Train final K-Means with optimal K and attach the labels to the dataset.
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["Cluster"] = kmeans_final.fit_predict(scaled_data)
|
| 44 |
+
|
| 45 |
+
# Tabs
tabs = st.tabs(["📊 Data Preview", "📈 Insights", "🧪 User Testing"])

# Tab 1: a sample of the labelled dataset plus the size of every cluster.
with tabs[0]:
    st.markdown("## 📊 Data Preview")
    with st.expander("📝 View Dataset Sample"):
        st.dataframe(df.head())

    st.markdown("### 🔍 Cluster Distribution")
    unique, counts = np.unique(df["Cluster"], return_counts=True)
    # Index the frame by cluster id so the chart draws a single "Count" bar
    # per cluster; with "Cluster" left as an ordinary column, st.bar_chart
    # would plot the cluster ids themselves as a second data series.
    cluster_counts = pd.DataFrame({"Cluster": unique, "Count": counts}).set_index("Cluster")
    st.bar_chart(cluster_counts)
|
| 56 |
+
|
| 57 |
+
# Tab 2: the series plotted against the row index, coloured by cluster.
with tabs[1]:
    st.markdown("## 📈 Insights & Visualizations")

    fig, ax = plt.subplots(figsize=(10, 5))
    # One scatter series per cluster so each gets its own colour and legend entry.
    for cluster in np.unique(df["Cluster"]):
        subset = df[df["Cluster"] == cluster]
        ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")
    # Faint dashed line through all points to show the order of the series.
    ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)
    ax.set_title("Clustered Data")
    ax.set_xlabel("Index")
    ax.set_ylabel("Accidental Deaths")
    ax.legend()
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed, and
    # this script is re-executed on each interaction, so release it explicitly.
    plt.close(fig)
|
| 70 |
+
|
| 71 |
+
# Tab 3: classify a user-supplied value and show it next to the dataset.
with tabs[2]:
    st.markdown("## 🧪 User Testing")
    st.markdown(f"### Optimal K Selected: **{optimal_k}**")

    user_input = st.number_input("Enter a new Accidental Deaths value:", min_value=0, step=1)

    if st.button("🔍 Analyze Input"):
        # Scale the value with the scaler fitted on the dataset. A one-row
        # DataFrame carrying the training column name is passed so that
        # scikit-learn does not warn about missing feature names.
        input_frame = pd.DataFrame({"Accidental Deaths": [user_input]})
        input_scaled = scaler.transform(input_frame)
        cluster_prediction = kmeans_final.predict(input_scaled)[0]
        # NOTE(review): assumes the pickled Isolation Forest was trained on
        # values standardized the same way as above -- confirm in model.ipynb.
        anomaly_prediction = isolation_forest.predict(input_scaled)[0]
        # IsolationForest.predict yields 1 for inliers and -1 for outliers.
        anomaly_status = "🟢 Normal" if anomaly_prediction == 1 else "🔴 Anomalous"

        # The status is reported once (it used to be written twice in a row).
        st.write(f"**Cluster Assigned:** {cluster_prediction}")
        st.write(f"**Anomaly Status:** {anomaly_status}")

        st.markdown("### 📊 Updated Cluster Visualization")
        fig, ax = plt.subplots(figsize=(10, 5))
        for cluster in np.unique(df["Cluster"]):
            subset = df[df["Cluster"] == cluster]
            ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")
        ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)

        # Highlight user input, drawn at the x position just past the row count.
        # `user_input` is always bound at this point, so no existence check is needed.
        ax.scatter(len(df), user_input, color='red', marker='x', s=100, label="User Input")

        ax.set_title("Updated Clustering Graph")
        ax.set_xlabel("Index")
        ax.set_ylabel("Accidental Deaths")
        ax.legend()
        st.pyplot(fig)
        # Close the figure so repeated analyses do not accumulate open figures.
        plt.close(fig)

    if st.button("🔄 Refresh Data"):
        st.rerun()
|
dataset.csv
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Month,Accidental Deaths
|
| 2 |
+
1973-01,9007
|
| 3 |
+
1973-02,8106
|
| 4 |
+
1973-03,8928
|
| 5 |
+
1973-04,9137
|
| 6 |
+
1973-05,10017
|
| 7 |
+
1973-06,10826
|
| 8 |
+
1973-07,11317
|
| 9 |
+
1973-08,10744
|
| 10 |
+
1973-09,9713
|
| 11 |
+
1973-10,9938
|
| 12 |
+
1973-11,9161
|
| 13 |
+
1973-12,8927
|
| 14 |
+
1974-01,7750
|
| 15 |
+
1974-02,6981
|
| 16 |
+
1974-03,8038
|
| 17 |
+
1974-04,8422
|
| 18 |
+
1974-05,8714
|
| 19 |
+
1974-06,9512
|
| 20 |
+
1974-07,10120
|
| 21 |
+
1974-08,9823
|
| 22 |
+
1974-09,8743
|
| 23 |
+
1974-10,9129
|
| 24 |
+
1974-11,8710
|
| 25 |
+
1974-12,8680
|
| 26 |
+
1975-01,8162
|
| 27 |
+
1975-02,7306
|
| 28 |
+
1975-03,8124
|
| 29 |
+
1975-04,7870
|
| 30 |
+
1975-05,9387
|
| 31 |
+
1975-06,9556
|
| 32 |
+
1975-07,10093
|
| 33 |
+
1975-08,9620
|
| 34 |
+
1975-09,8285
|
| 35 |
+
1975-10,8433
|
| 36 |
+
1975-11,8160
|
| 37 |
+
1975-12,8034
|
| 38 |
+
1976-01,7717
|
| 39 |
+
1976-02,7461
|
| 40 |
+
1976-03,7776
|
| 41 |
+
1976-04,7925
|
| 42 |
+
1976-05,8634
|
| 43 |
+
1976-06,8945
|
| 44 |
+
1976-07,10078
|
| 45 |
+
1976-08,9179
|
| 46 |
+
1976-09,8037
|
| 47 |
+
1976-10,8488
|
| 48 |
+
1976-11,7874
|
| 49 |
+
1976-12,8647
|
| 50 |
+
1977-01,7792
|
| 51 |
+
1977-02,6957
|
| 52 |
+
1977-03,7726
|
| 53 |
+
1977-04,8106
|
| 54 |
+
1977-05,8890
|
| 55 |
+
1977-06,9299
|
| 56 |
+
1977-07,10625
|
| 57 |
+
1977-08,9302
|
| 58 |
+
1977-09,8314
|
| 59 |
+
1977-10,8850
|
| 60 |
+
1977-11,8265
|
| 61 |
+
1977-12,8796
|
| 62 |
+
1978-01,7836
|
| 63 |
+
1978-02,6892
|
| 64 |
+
1978-03,7791
|
| 65 |
+
1978-04,8129
|
| 66 |
+
1978-05,9115
|
| 67 |
+
1978-06,9434
|
| 68 |
+
1978-07,10484
|
| 69 |
+
1978-08,9827
|
| 70 |
+
1978-09,9110
|
| 71 |
+
1978-10,9070
|
| 72 |
+
1978-11,8633
|
| 73 |
+
1978-12,9240
|
isolation_forest_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fba1121133e60dbbc29e158cef747fa59d9dd96bea6ff2ac65a48232df0f3d7
|
| 3 |
+
size 1411305
|
kmeans_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00192b3fc5d36ba2eec69abb6122d4c1a5ba5d3626a21271391f9f57abcb85fc
|
| 3 |
+
size 1063
|
model.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
matplotlib
|
| 3 |
+
numpy
|
| 4 |
+
pandas
|
| 5 |
+
scikit-learn
|