Markndrei committed on
Commit
0915a9a
·
1 Parent(s): f976d9e

uploading application files

Browse files
Files changed (6) hide show
  1. app.py +105 -0
  2. dataset.csv +73 -0
  3. isolation_forest_model.pkl +3 -0
  4. kmeans_model.pkl +3 -0
  5. model.ipynb +0 -0
  6. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# NOTE(review): the pickled KMeans model is loaded but never used below — the
# app retrains K-Means from scratch on every rerun. Kept for backward
# compatibility; consider either using it or removing the artifact.
kmeans = joblib.load("kmeans_model.pkl")
isolation_forest = joblib.load("isolation_forest_model.pkl")

# ---------------------------------------------------------------------------
# Streamlit page setup
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Anomaly & Clustering Analysis", page_icon="📊", layout="wide")
st.title("📌 Anomaly & Clustering Analysis")
st.markdown("### A simple interactive app to analyze anomalies and clusters in generated data using Isolation Forest and K-Means clustering.")

# ---------------------------------------------------------------------------
# Data loading & cleaning
# ---------------------------------------------------------------------------
df = pd.read_csv("dataset.csv")

# BUGFIX: coerce to numeric BEFORE dropping NaNs. The original dropped NaNs
# first and coerced afterwards, so any non-numeric cell became NaN again and
# would crash StandardScaler downstream.
df["Accidental Deaths"] = pd.to_numeric(df["Accidental Deaths"], errors="coerce")
df.dropna(inplace=True)

# Standardize the single feature. Fitting on a DataFrame (not a bare array)
# lets the scaler remember the feature name, which avoids sklearn
# "X does not have valid feature names" warnings at transform time.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["Accidental Deaths"]])

# ---------------------------------------------------------------------------
# Choose K by maximizing the silhouette score over K = 2..9
# ---------------------------------------------------------------------------
silhouette_scores = []
k_values = range(2, 10)
for k in k_values:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans_temp.fit_predict(scaled_data)
    silhouette_scores.append(silhouette_score(scaled_data, labels))
optimal_k = k_values[np.argmax(silhouette_scores)]

# Train the final K-Means with the best K and label every row.
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["Cluster"] = kmeans_final.fit_predict(scaled_data)

# ---------------------------------------------------------------------------
# UI tabs
# ---------------------------------------------------------------------------
tabs = st.tabs(["📊 Data Preview", "📈 Insights", "🧪 User Testing"])

with tabs[0]:
    st.markdown("## 📊 Data Preview")
    with st.expander("📝 View Dataset Sample"):
        st.dataframe(df.head())

    st.markdown("### 🔍 Cluster Distribution")
    unique, counts = np.unique(df["Cluster"], return_counts=True)
    st.bar_chart(pd.DataFrame({"Cluster": unique, "Count": counts}))

with tabs[1]:
    st.markdown("## 📈 Insights & Visualizations")

    # Scatter each cluster in its own color over a faint line of the raw series.
    fig, ax = plt.subplots(figsize=(10, 5))
    for cluster in np.unique(df["Cluster"]):
        subset = df[df["Cluster"] == cluster]
        ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")
    ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)
    ax.set_title("Clustered Data")
    ax.set_xlabel("Index")
    ax.set_ylabel("Accidental Deaths")
    ax.legend()
    st.pyplot(fig)

with tabs[2]:
    st.markdown("## 🧪 User Testing")
    st.markdown(f"### Optimal K Selected: **{optimal_k}**")

    user_input = st.number_input("Enter a new Accidental Deaths value:", min_value=0, step=1)

    if st.button("🔍 Analyze Input"):
        # Wrap the value in a one-row DataFrame with the training column name so
        # the fitted scaler sees matching feature names.
        input_scaled = scaler.transform(pd.DataFrame([[user_input]], columns=["Accidental Deaths"]))
        cluster_prediction = kmeans_final.predict(input_scaled)[0]
        # NOTE(review): assumes the pickled IsolationForest was trained on
        # standardized values — confirm against the training notebook.
        anomaly_prediction = isolation_forest.predict(input_scaled)[0]
        anomaly_status = "🟢 Normal" if anomaly_prediction == 1 else "🔴 Anomalous"

        st.write(f"**Cluster Assigned:** {cluster_prediction}")
        # BUGFIX: the status was previously rendered twice; show it once.
        st.write(f"### **Anomaly Prediction: {anomaly_status}**")

        st.markdown("### 📊 Updated Cluster Visualization")
        fig, ax = plt.subplots(figsize=(10, 5))
        for cluster in np.unique(df["Cluster"]):
            subset = df[df["Cluster"] == cluster]
            ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")
        ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)

        # Highlight the user's value one position past the last data point.
        # (The original guarded this with `if 'user_input' in locals()`, which
        # was always true because st.number_input runs unconditionally.)
        ax.scatter(len(df), user_input, color="red", marker="x", s=100, label="User Input")

        ax.set_title("Updated Clustering Graph")
        ax.set_xlabel("Index")
        ax.set_ylabel("Accidental Deaths")
        ax.legend()
        st.pyplot(fig)

    if st.button("🔄 Refresh Data"):
        st.rerun()
dataset.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Month,Accidental Deaths
2
+ 1973-01,9007
3
+ 1973-02,8106
4
+ 1973-03,8928
5
+ 1973-04,9137
6
+ 1973-05,10017
7
+ 1973-06,10826
8
+ 1973-07,11317
9
+ 1973-08,10744
10
+ 1973-09,9713
11
+ 1973-10,9938
12
+ 1973-11,9161
13
+ 1973-12,8927
14
+ 1974-01,7750
15
+ 1974-02,6981
16
+ 1974-03,8038
17
+ 1974-04,8422
18
+ 1974-05,8714
19
+ 1974-06,9512
20
+ 1974-07,10120
21
+ 1974-08,9823
22
+ 1974-09,8743
23
+ 1974-10,9129
24
+ 1974-11,8710
25
+ 1974-12,8680
26
+ 1975-01,8162
27
+ 1975-02,7306
28
+ 1975-03,8124
29
+ 1975-04,7870
30
+ 1975-05,9387
31
+ 1975-06,9556
32
+ 1975-07,10093
33
+ 1975-08,9620
34
+ 1975-09,8285
35
+ 1975-10,8433
36
+ 1975-11,8160
37
+ 1975-12,8034
38
+ 1976-01,7717
39
+ 1976-02,7461
40
+ 1976-03,7776
41
+ 1976-04,7925
42
+ 1976-05,8634
43
+ 1976-06,8945
44
+ 1976-07,10078
45
+ 1976-08,9179
46
+ 1976-09,8037
47
+ 1976-10,8488
48
+ 1976-11,7874
49
+ 1976-12,8647
50
+ 1977-01,7792
51
+ 1977-02,6957
52
+ 1977-03,7726
53
+ 1977-04,8106
54
+ 1977-05,8890
55
+ 1977-06,9299
56
+ 1977-07,10625
57
+ 1977-08,9302
58
+ 1977-09,8314
59
+ 1977-10,8850
60
+ 1977-11,8265
61
+ 1977-12,8796
62
+ 1978-01,7836
63
+ 1978-02,6892
64
+ 1978-03,7791
65
+ 1978-04,8129
66
+ 1978-05,9115
67
+ 1978-06,9434
68
+ 1978-07,10484
69
+ 1978-08,9827
70
+ 1978-09,9110
71
+ 1978-10,9070
72
+ 1978-11,8633
73
+ 1978-12,9240
isolation_forest_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fba1121133e60dbbc29e158cef747fa59d9dd96bea6ff2ac65a48232df0f3d7
3
+ size 1411305
kmeans_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00192b3fc5d36ba2eec69abb6122d4c1a5ba5d3626a21271391f9f57abcb85fc
3
+ size 1063
model.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ matplotlib
3
+ numpy
4
+ pandas
5
+ scikit-learn
+ joblib