# Source: uploaded by Markndrei ("uploading application files", commit 0915a9a).
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
# Streamlit UI
# The page configuration is issued before anything else that can emit UI:
# Streamlit expects set_page_config() to be the first Streamlit command, and a
# cached loader may show a spinner the first time it runs.
# NOTE(review): the emoji in the string literals of this script look
# mis-encoded (mojibake) -- confirm the file encoding; they are left untouched.
st.set_page_config(page_title="Anomaly & Clustering Analysis", page_icon="πŸ“Š", layout="wide")


@st.cache_resource
def _load_model(path):
    """Deserialize the joblib file at ``path`` and return the stored object.

    Wrapped in ``st.cache_resource`` so the file is read once per server
    process instead of on every script rerun (each widget interaction reruns
    the whole script). The cached object is shared between reruns, so callers
    must treat it as read-only.

    joblib uses pickle under the hood: only load files from a trusted source.
    """
    return joblib.load(path)


# Load Models
# NOTE(review): `kmeans` is not referenced again in the visible script (the
# clustering below fits a fresh model) -- confirm whether it is still needed.
kmeans = _load_model("kmeans_model.pkl")
isolation_forest = _load_model("isolation_forest_model.pkl")

st.title("πŸ“Œ Anomaly & Clustering Analysis")
st.markdown("### A simple interactive app to analyze anomalies and clusters in generated data using Isolation Forest and K-Means clustering.")
# Load dataset
df = pd.read_csv("dataset.csv")

# Ensure data is numerical.
# Coercion must happen BEFORE dropping missing rows: errors="coerce" turns
# unparseable entries into NaN, and those rows would otherwise reach the
# scaler and K-Means (which rejects NaN input).
df["Accidental Deaths"] = pd.to_numeric(df["Accidental Deaths"], errors="coerce")
df.dropna(inplace=True)

# Standardize Data
# The fitted scaler is kept at module level because user-supplied values are
# transformed with it later in the script.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[["Accidental Deaths"]])
# Determine optimal K
def _silhouette_for(n_clusters):
    """Fit a throwaway K-Means with ``n_clusters`` clusters on ``scaled_data``
    and return the silhouette score of the resulting labelling."""
    candidate = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    return silhouette_score(scaled_data, candidate.fit_predict(scaled_data))


# Candidate cluster counts: 2 through 9 inclusive.
k_values = range(2, 10)
silhouette_scores = [_silhouette_for(k) for k in k_values]

# np.argmax yields the position of the first maximum, so ties resolve to the
# smaller cluster count.
best_position = np.argmax(silhouette_scores)
optimal_k = k_values[best_position]

# Train final K-Means with optimal K
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["Cluster"] = kmeans_final.fit_predict(scaled_data)
# Tabs
tabs = st.tabs(["πŸ“Š Data Preview", "πŸ“ˆ Insights", "πŸ§ͺ User Testing"])

# --- Tab 0: raw data sample and how many rows landed in each cluster ---
with tabs[0]:
    st.markdown("## πŸ“Š Data Preview")
    with st.expander("πŸ“ View Dataset Sample"):
        st.dataframe(df.head())

    st.markdown("### πŸ” Cluster Distribution")
    unique, counts = np.unique(df["Cluster"], return_counts=True)
    # Name the axes explicitly: without x/y, st.bar_chart treats every column
    # as a value series, so the cluster ids themselves were drawn as bars
    # alongside the counts.
    st.bar_chart(
        pd.DataFrame({"Cluster": unique, "Count": counts}),
        x="Cluster",
        y="Count",
    )
# --- Tab 1: scatter of the series coloured by assigned cluster ---
with tabs[1]:
    st.markdown("## πŸ“ˆ Insights & Visualizations")
    fig, ax = plt.subplots(figsize=(10, 5))

    # One scatter call per cluster so each gets its own colour and legend entry.
    for cluster in np.unique(df["Cluster"]):
        subset = df[df["Cluster"] == cluster]
        ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")

    # Faint dashed line through all points to show the overall sequence.
    ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)
    ax.set_title("Clustered Data")
    ax.set_xlabel("Index")
    ax.set_ylabel("Accidental Deaths")
    ax.legend()
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; the script
    # reruns on each interaction, so release the figure once it is rendered.
    plt.close(fig)
# --- Tab 2: classify a user-supplied value and show it against the clusters ---
with tabs[2]:
    st.markdown("## πŸ§ͺ User Testing")
    st.markdown(f"### Optimal K Selected: **{optimal_k}**")
    user_input = st.number_input("Enter a new Accidental Deaths value:", min_value=0, step=1)

    if st.button("πŸ” Analyze Input"):
        # The scaler was fitted on a DataFrame with a named column; passing the
        # same column name avoids scikit-learn's feature-name mismatch warning.
        input_frame = pd.DataFrame({"Accidental Deaths": [user_input]})
        input_scaled = scaler.transform(input_frame)
        cluster_prediction = kmeans_final.predict(input_scaled)[0]
        # NOTE(review): assumes the persisted IsolationForest was trained on
        # values standardized the same way as here -- TODO confirm.
        anomaly_prediction = isolation_forest.predict(input_scaled)[0]
        # The code treats a prediction of 1 as normal and anything else as anomalous.
        anomaly_status = "🟒 Normal" if anomaly_prediction == 1 else "πŸ”΄ Anomalous"
        st.write(f"**Cluster Assigned:** {cluster_prediction}")
        st.write(f"**Anomaly Status:** {anomaly_status}")
        st.write(f"### **Anomaly Prediction: {anomaly_status}**")

    st.markdown("### πŸ“Š Updated Cluster Visualization")
    fig, ax = plt.subplots(figsize=(10, 5))
    for cluster in np.unique(df["Cluster"]):
        subset = df[df["Cluster"] == cluster]
        ax.scatter(subset.index, subset["Accidental Deaths"], label=f"Cluster {cluster}")
    ax.plot(df.index, df["Accidental Deaths"], color="gray", linestyle="dashed", alpha=0.5)

    # Highlight user input.
    # `user_input` is always bound by the number_input call above, so the
    # former `'user_input' in locals()` guard was always true and is dropped.
    # The marker is placed at x = len(df).
    ax.scatter(len(df), user_input, color='red', marker='x', s=100, label="User Input")
    ax.set_title("Updated Clustering Graph")
    ax.set_xlabel("Index")
    ax.set_ylabel("Accidental Deaths")
    ax.legend()
    st.pyplot(fig)
    # Close the figure after rendering so reruns do not accumulate open figures.
    plt.close(fig)

    if st.button("πŸ”„ Refresh Data"):
        st.rerun()