Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
| 5 |
+
from sklearn.metrics import silhouette_score
|
| 6 |
+
from sklearn.preprocessing import StandardScaler
|
| 7 |
+
from statsmodels.tsa.arima.model import ARIMA
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
|
| 11 |
+
# Streamlit app: clustering and ARIMA analysis of an uploaded CSV file.
#
# The page lets the user upload a CSV, clusters its numeric columns with
# K-Means, agglomerative (hierarchical) clustering or DBSCAN, plots the
# result, optionally fits an ARIMA model when a date column is present, and
# finally shows a K-Means vs. hierarchical silhouette comparison table.

# Largest cluster count offered by the sliders and by the comparison table.
MAX_CLUSTERS = 7

# Fewest usable rows for clustering. silhouette_score needs at most
# n_samples - 1 distinct labels, and the cluster-count slider needs a
# non-degenerate range starting at 2, so 4 rows is the practical floor.
MIN_ROWS = 4


def silhouette_is_defined(labels):
    """Return True when a silhouette score can be computed for *labels*.

    scikit-learn's ``silhouette_score`` raises ``ValueError`` unless the
    number of distinct labels lies between 2 and ``n_samples - 1``.
    """
    n_labels = len(np.unique(labels))
    return 2 <= n_labels <= len(labels) - 1


def safe_silhouette(features, labels):
    """Return the silhouette score, or NaN when it is undefined."""
    if not silhouette_is_defined(labels):
        return np.nan
    return silhouette_score(features, labels)


def find_datetime_column(frame):
    """Locate the first date-like column of *frame*.

    A column qualifies when it already has a datetime64 dtype, or when it is
    a text column whose non-missing values all parse as dates. Text columns
    must be probed because ``pd.read_csv`` without ``parse_dates`` leaves
    dates as plain strings.

    Returns:
        ``(column_name, parsed_series)`` for the first match, otherwise
        ``(None, None)``.
    """
    # Local import: only needed to silence the per-column "could not infer
    # format" warnings that probing ordinary text columns would emit.
    import warnings

    for name in frame.columns:
        column = frame[name]
        if pd.api.types.is_datetime64_any_dtype(column):
            return name, column

        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            # Numeric columns would "parse" as epoch offsets; never treat
            # them as dates.
            continue

        n_present = int(column.notna().sum())
        if n_present == 0:
            continue

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError):
            continue

        # Accept only columns where every non-missing value parsed.
        if int(parsed.notna().sum()) == n_present:
            return name, parsed

    return None, None


def run_clustering(features, max_k):
    """Render the clustering controls, fit the chosen model, return labels.

    Args:
        features: 2-D array of numeric features without missing values.
        max_k: Largest cluster count the sliders may offer.
    """
    method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if method == "DBSCAN":
        eps_value = st.slider(
            "Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5
        )
        min_samples_value = st.slider(
            "Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5
        )
        labels = DBSCAN(eps=eps_value, min_samples=min_samples_value).fit_predict(features)

        # The noise label (-1) counts as a label here, as in the original
        # check; the helper additionally rejects the "every point is its own
        # cluster" case, which silhouette_score cannot handle.
        if silhouette_is_defined(labels):
            silhouette_avg = silhouette_score(features, labels)
            st.write(f"DBSCAN Silhouette Score: {silhouette_avg}")
        else:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
        return labels

    # K-Means and hierarchical clustering share the same controls.
    # MIN_ROWS guarantees max_k >= 3, so the default of 3 is always in range.
    n_clusters = st.slider(
        f"Select number of clusters for {method}",
        min_value=2,
        max_value=max_k,
        value=3,
    )
    if method == "K-Means":
        model = KMeans(n_clusters=n_clusters, random_state=42)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(features)

    if silhouette_is_defined(labels):
        silhouette_avg = silhouette_score(features, labels)
        st.write(f"{method} Silhouette Score for {n_clusters} clusters: {silhouette_avg}")
    else:
        st.warning(
            f"{method} produced too few distinct clusters to compute a silhouette score."
        )
    return labels


def plot_clusters(features, labels, column_names):
    """Scatter-plot the first two feature columns coloured by cluster label.

    With a single feature column the points are drawn along a horizontal
    line instead of indexing a second column that does not exist.
    """
    x_values = features[:, 0]
    if features.shape[1] > 1:
        y_values = features[:, 1]
        y_label = column_names[1]
    else:
        y_values = np.zeros(len(x_values))
        y_label = ""

    # Use an explicit figure rather than pyplot's global state so each rerun
    # draws on a fresh canvas, and close it once rendered.
    fig, ax = plt.subplots()
    sns.scatterplot(x=x_values, y=y_values, hue=labels, palette='Set1', ax=ax)
    ax.set_xlabel(column_names[0])
    ax.set_ylabel(y_label)
    st.pyplot(fig)
    plt.close(fig)


def run_arima(data, numerical_cols):
    """Fit and plot an ARIMA model when *data* has a date-like column.

    The model is fitted on a user-selected numeric column indexed by the
    detected date column, not on the timestamps themselves.
    """
    time_series_col, timestamps = find_datetime_column(data)
    if time_series_col is None:
        st.info("No date/time column detected; skipping ARIMA analysis.")
        return

    st.write("Time Series Analysis (ARIMA) on column:", time_series_col)
    target_col = st.selectbox("Select the numeric column to model", numerical_cols)

    series = pd.Series(
        data[target_col].to_numpy(),
        index=pd.DatetimeIndex(timestamps),
        name=target_col,
    )
    # Drop rows lacking a timestamp or a value, then order chronologically.
    series = series[series.index.notna()].dropna().sort_index()
    if series.empty:
        st.warning("No usable observations for ARIMA after removing missing values.")
        return

    # ARIMA model order
    p = st.number_input("ARIMA p value", min_value=0, max_value=5, value=1)
    d = st.number_input("ARIMA d value", min_value=0, max_value=2, value=1)
    q = st.number_input("ARIMA q value", min_value=0, max_value=5, value=1)

    try:
        arima_result = ARIMA(series, order=(int(p), int(d), int(q))).fit()
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA could not be fitted: {exc}")
        return

    # Display ARIMA result summary
    st.write(arima_result.summary())

    # Plot observed values against in-sample predictions. Results from
    # statsmodels.tsa.arima.model.ARIMA have no plot_predict() method, so the
    # prediction is drawn directly. The first d points are skipped because
    # predictions of a differenced model are not meaningful there.
    fitted = np.asarray(arima_result.predict())
    skip = int(d)
    fig, ax = plt.subplots()
    ax.plot(series.index, series.to_numpy(), label="observed")
    ax.plot(series.index[skip:], fitted[skip:], label="in-sample prediction")
    ax.set_xlabel(time_series_col)
    ax.set_ylabel(target_col)
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)


def build_silhouette_table(features, max_k):
    """Return a DataFrame of K-Means and hierarchical silhouette scores.

    One row per cluster count from 2 to *max_k*; a score is NaN when it is
    undefined for the labels a model produced.
    """
    rows = []
    for n_clusters in range(2, max_k + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(features)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(features)
        rows.append(
            {
                'Number of Clusters': n_clusters,
                'K-Means Silhouette Score': safe_silhouette(features, kmeans_labels),
                'Hierarchical Silhouette Score': safe_silhouette(features, hierarchical_labels),
            }
        )
    return pd.DataFrame(
        rows,
        columns=[
            'Number of Clusters',
            'K-Means Silhouette Score',
            'Hierarchical Silhouette Score',
        ],
    )


def main():
    """Render the whole page: upload, clustering, plot, ARIMA, score table."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the CSV file: {exc}")
        return
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Selecting only numerical columns for clustering
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.warning("The file has no numerical columns, so nothing can be clustered.")
        return

    # The scaler and the clustering estimators reject NaN, so rows with
    # missing numeric values are excluded up front.
    numeric = data[numerical_cols].dropna()
    n_dropped = len(data) - len(numeric)
    if n_dropped:
        st.warning(f"Ignoring {n_dropped} row(s) with missing numerical values.")
    if len(numeric) < MIN_ROWS:
        st.warning(f"At least {MIN_ROWS} complete rows are needed for clustering.")
        return

    # Option to scale data or not
    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data:
        data_scaled = StandardScaler().fit_transform(numeric)
    else:
        data_scaled = numeric.to_numpy()

    # Never request more clusters than silhouette_score can evaluate.
    max_k = min(MAX_CLUSTERS, len(numeric) - 1)

    # Step 3: Clustering Algorithm Selection
    cluster_labels = run_clustering(data_scaled, max_k)

    # Step 4: Visualize the clusters if valid
    if len(np.unique(cluster_labels)) > 1:
        st.write("Cluster Labels:", np.unique(cluster_labels))
        plot_clusters(data_scaled, cluster_labels, numerical_cols)

    # Step 5: ARIMA Time Series Analysis
    run_arima(data, numerical_cols)

    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    st.write(f"### Silhouette Score Table for 2-{max_k} Clusters")
    st.write(build_silhouette_table(data_scaled, max_k))


# Streamlit executes the script as __main__, so the page still renders on
# every run while the helpers above stay importable without side effects.
if __name__ == "__main__":
    main()
|