"""Streamlit app: K-Means clustering and Isolation Forest checks on one numeric column.

The script reads ``dataset.csv``, standardizes the "Accidental Deaths" column,
picks the number of clusters by silhouette score, and lets the user classify a
new value against the fitted clusters and a pre-trained Isolation Forest.
"""

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

FEATURE = "Accidental Deaths"
DATASET_PATH = "dataset.csv"
KMEANS_MODEL_PATH = "kmeans_model.pkl"
ISOLATION_FOREST_MODEL_PATH = "isolation_forest_model.pkl"
MAX_K = 9  # largest cluster count tried by the silhouette search (inclusive)


@st.cache_resource
def load_models():
    """Load the pickled models once instead of on every Streamlit rerun.

    Returns:
        Tuple ``(kmeans, isolation_forest)`` exactly as stored on disk.
    """
    # NOTE: joblib files are pickles and execute code on load; only ship
    # trusted model files next to this app.
    return joblib.load(KMEANS_MODEL_PATH), joblib.load(ISOLATION_FOREST_MODEL_PATH)


@st.cache_data
def load_dataset(path: str) -> pd.DataFrame:
    """Read the CSV at *path* and return only rows usable for clustering.

    Raises:
        ValueError: if the expected feature column is missing.
    """
    frame = pd.read_csv(path)
    if FEATURE not in frame.columns:
        raise ValueError(f"Column '{FEATURE}' not found in {path}.")
    # Coerce BEFORE dropping: non-numeric cells become NaN here and must be
    # removed too, otherwise K-Means fails on them.
    frame[FEATURE] = pd.to_numeric(frame[FEATURE], errors="coerce")
    return frame.dropna()


@st.cache_data
def fit_clusters(values: np.ndarray, max_k: int = MAX_K):
    """Standardize *values*, choose K by silhouette score and fit the final K-Means.

    Args:
        values: 2-D float array of shape ``(n_samples, 1)``.
        max_k: largest number of clusters to try (inclusive).

    Returns:
        Tuple ``(scaler, scaled, k_values, scores, best_k, model, labels)``.

    Raises:
        ValueError: if the data cannot support at least two clusters.
    """
    n_samples = values.shape[0]
    n_distinct = np.unique(values, axis=0).shape[0] if n_samples else 0
    # silhouette_score needs 2 <= n_labels <= n_samples - 1, and K-Means cannot
    # produce more clusters than there are distinct points.
    upper = min(max_k, n_distinct, n_samples - 1)
    if upper < 2:
        raise ValueError(
            "Not enough distinct numeric values to cluster: need at least 3 rows "
            "and 2 distinct values."
        )

    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)

    k_values = list(range(2, upper + 1))
    scores = []
    for k in k_values:
        trial_labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(scaled)
        scores.append(silhouette_score(scaled, trial_labels))
    best_k = k_values[int(np.argmax(scores))]

    model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    labels = model.fit_predict(scaled)
    return scaler, scaled, k_values, scores, best_k, model, labels


def plot_clusters(frame: pd.DataFrame, title: str, user_point=None) -> None:
    """Render a scatter plot of *frame* colored by cluster.

    Args:
        frame: data with the feature column and a "Cluster" column.
        title: plot title.
        user_point: optional ``(x, y)`` pair drawn as a red cross.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    for cluster, subset in frame.groupby("Cluster"):
        ax.scatter(subset.index, subset[FEATURE], label=f"Cluster {cluster}")
    ax.plot(frame.index, frame[FEATURE], color="gray", linestyle="dashed", alpha=0.5)
    if user_point is not None:
        ax.scatter(*user_point, color="red", marker="x", s=100, label="User Input")
    ax.set_title(title)
    ax.set_xlabel("Index")
    ax.set_ylabel(FEATURE)
    ax.legend()
    st.pyplot(fig)
    # pyplot keeps every figure alive until closed; the server process is
    # long-running, so release it after rendering.
    plt.close(fig)


# Streamlit UI (page config first, before any call that may draw a spinner).
st.set_page_config(page_title="Anomaly & Clustering Analysis", page_icon="📊", layout="wide")
st.title("📌 Anomaly & Clustering Analysis")
st.markdown(
    "### A simple interactive app to analyze anomalies and clusters in generated data "
    "using Isolation Forest and K-Means clustering."
)

# `kmeans` is loaded for parity with the original script but is not used below;
# clustering relies on the model fitted with the silhouette-selected K.
kmeans, isolation_forest = load_models()

try:
    df = load_dataset(DATASET_PATH)
    (
        scaler,
        scaled_data,
        k_values,
        silhouette_scores,
        optimal_k,
        kmeans_final,
        cluster_labels,
    ) = fit_clusters(df[[FEATURE]].to_numpy(dtype=float))
except ValueError as err:
    st.error(str(err))
    st.stop()

# st.cache_data hands back a private copy, so adding a column is safe.
df["Cluster"] = cluster_labels

tabs = st.tabs(["📊 Data Preview", "📈 Insights", "🧪 User Testing"])

with tabs[0]:
    st.markdown("## 📊 Data Preview")
    with st.expander("📁 View Dataset Sample"):
        st.dataframe(df.head())
    st.markdown("### 🔍 Cluster Distribution")
    unique, counts = np.unique(df["Cluster"], return_counts=True)
    # Index by cluster id so only the counts are drawn as bars.
    st.bar_chart(pd.DataFrame({"Cluster": unique, "Count": counts}).set_index("Cluster"))

with tabs[1]:
    st.markdown("## 📈 Insights & Visualizations")
    plot_clusters(df, "Clustered Data")

with tabs[2]:
    st.markdown("## 🧪 User Testing")
    st.markdown(f"### Optimal K Selected: **{optimal_k}**")
    user_input = st.number_input("Enter a new Accidental Deaths value:", min_value=0, step=1)

    if st.button("🔍 Analyze Input"):
        input_scaled = scaler.transform(np.array([[user_input]], dtype=float))
        cluster_prediction = kmeans_final.predict(input_scaled)[0]
        # NOTE(review): the pre-trained Isolation Forest receives a value
        # standardized by a scaler fitted in this app; confirm the model was
        # trained on equally scaled data.
        anomaly_prediction = isolation_forest.predict(input_scaled)[0]
        anomaly_status = "🟢 Normal" if anomaly_prediction == 1 else "🔴 Anomalous"

        st.write(f"**Cluster Assigned:** {cluster_prediction}")
        st.write(f"**Anomaly Status:** {anomaly_status}")
        st.write(f"### **Anomaly Prediction: {anomaly_status}**")

        st.markdown("### 📊 Updated Cluster Visualization")
        # Row labels have gaps after dropna, so place the marker just past the
        # largest existing label rather than at len(df).
        plot_clusters(
            df,
            "Updated Clustering Graph",
            user_point=(df.index.max() + 1, user_input),
        )

    if st.button("🔄 Refresh Data"):
        # Drop cached data and models so the rerun re-reads everything from disk.
        st.cache_data.clear()
        st.cache_resource.clear()
        st.rerun()