Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,8 +5,8 @@ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
|
| 5 |
from sklearn.metrics import silhouette_score
|
| 6 |
from sklearn.preprocessing import StandardScaler
|
| 7 |
from statsmodels.tsa.arima.model import ARIMA
|
| 8 |
-
import
|
| 9 |
-
import
|
| 10 |
|
| 11 |
# Streamlit app title
|
| 12 |
st.title('Clustering and Time Series Analysis')
|
|
@@ -23,13 +23,10 @@ if uploaded_file is not None:
|
|
| 23 |
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
| 24 |
st.write("Numerical columns for clustering:", numerical_cols)
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
data_scaled = scaler.fit_transform(data[numerical_cols])
|
| 31 |
-
else:
|
| 32 |
-
data_scaled = data[numerical_cols].values
|
| 33 |
|
| 34 |
# Step 3: Clustering Algorithm Selection
|
| 35 |
clustering_method = st.selectbox("Choose a clustering method", ["K-Means", "Hierarchical Clustering", "DBSCAN"])
|
|
@@ -61,11 +58,13 @@ if uploaded_file is not None:
|
|
| 61 |
else:
|
| 62 |
st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
|
| 63 |
|
| 64 |
-
# Step 4: Visualize the clusters
|
| 65 |
if len(set(cluster_labels)) > 1:
|
| 66 |
st.write("Cluster Labels:", np.unique(cluster_labels))
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# Step 5: ARIMA Time Series Analysis
|
| 71 |
# Checking if there are any time-related columns
|
|
@@ -89,11 +88,11 @@ if uploaded_file is not None:
|
|
| 89 |
|
| 90 |
# Display ARIMA result summary
|
| 91 |
st.write(arima_result.summary())
|
| 92 |
-
|
| 93 |
-
# Plotting the
|
| 94 |
-
fig
|
| 95 |
-
arima_result.plot_predict(dynamic=False, ax=
|
| 96 |
-
st.
|
| 97 |
|
| 98 |
# Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
|
| 99 |
st.write("### Silhouette Score Table for 2-7 Clusters")
|
|
@@ -115,4 +114,24 @@ if uploaded_file is not None:
|
|
| 115 |
silhouette_scores['Hierarchical Silhouette Score'].append(hierarchical_silhouette)
|
| 116 |
|
| 117 |
silhouette_df = pd.DataFrame(silhouette_scores)
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from sklearn.metrics import silhouette_score
|
| 6 |
from sklearn.preprocessing import StandardScaler
|
| 7 |
from statsmodels.tsa.arima.model import ARIMA
|
| 8 |
+
import plotly.express as px
|
| 9 |
+
import plotly.graph_objects as go
|
| 10 |
|
| 11 |
# Streamlit app title
|
| 12 |
st.title('Clustering and Time Series Analysis')
|
|
|
|
| 23 |
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
| 24 |
st.write("Numerical columns for clustering:", numerical_cols)
|
| 25 |
|
| 26 |
+
# Step 2.1: Data Standardization using StandardScaler (always applied)
|
| 27 |
+
scaler = StandardScaler()
|
| 28 |
+
data_scaled = scaler.fit_transform(data[numerical_cols])
|
| 29 |
+
st.write("Data has been standardized using StandardScaler.")
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# Step 3: Clustering Algorithm Selection
|
| 32 |
clustering_method = st.selectbox("Choose a clustering method", ["K-Means", "Hierarchical Clustering", "DBSCAN"])
|
|
|
|
| 58 |
else:
|
| 59 |
st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
|
| 60 |
|
| 61 |
+
# Step 4: Visualize the clusters using Plotly
|
| 62 |
if len(set(cluster_labels)) > 1:
|
| 63 |
st.write("Cluster Labels:", np.unique(cluster_labels))
|
| 64 |
+
|
| 65 |
+
fig = px.scatter(x=data_scaled[:, 0], y=data_scaled[:, 1], color=cluster_labels, title="Clustering Results",
|
| 66 |
+
labels={'x': numerical_cols[0], 'y': numerical_cols[1]})
|
| 67 |
+
st.plotly_chart(fig)
|
| 68 |
|
| 69 |
# Step 5: ARIMA Time Series Analysis
|
| 70 |
# Checking if there are any time-related columns
|
|
|
|
| 88 |
|
| 89 |
# Display ARIMA result summary
|
| 90 |
st.write(arima_result.summary())
|
| 91 |
+
|
| 92 |
+
# Plotting the ARIMA results
|
| 93 |
+
fig = go.Figure()
|
| 94 |
+
fig.add_trace(go.Scatter(y=arima_result.predict(dynamic=False), mode='lines', name='ARIMA in-sample prediction'))  # plotly figures have no add_subplot(); plot the non-dynamic predictions as a trace
|
| 95 |
+
st.plotly_chart(fig)
|
| 96 |
|
| 97 |
# Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
|
| 98 |
st.write("### Silhouette Score Table for 2-7 Clusters")
|
|
|
|
| 114 |
silhouette_scores['Hierarchical Silhouette Score'].append(hierarchical_silhouette)
|
| 115 |
|
| 116 |
silhouette_df = pd.DataFrame(silhouette_scores)
|
| 117 |
+
|
| 118 |
+
# Plot the silhouette scores from the table as a line chart using Plotly
|
| 119 |
+
fig = go.Figure()
|
| 120 |
+
|
| 121 |
+
# Plot K-Means Silhouette Scores
|
| 122 |
+
fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['K-Means Silhouette Score'],
|
| 123 |
+
mode='lines+markers', name='K-Means Silhouette Score'))
|
| 124 |
+
|
| 125 |
+
# Plot Hierarchical Silhouette Scores
|
| 126 |
+
fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['Hierarchical Silhouette Score'],
|
| 127 |
+
mode='lines+markers', name='Hierarchical Silhouette Score'))
|
| 128 |
+
|
| 129 |
+
# Set the y-axis range from -1 to 1 with intervals of 0.2
|
| 130 |
+
fig.update_layout(
|
| 131 |
+
title="Silhouette Scores for K-Means and Hierarchical Clustering",
|
| 132 |
+
xaxis_title="Number of Clusters",
|
| 133 |
+
yaxis_title="Silhouette Score",
|
| 134 |
+
yaxis=dict(range=[-1, 1], dtick=0.2)
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
st.plotly_chart(fig)
|