Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering | |
| from sklearn.metrics import silhouette_score | |
| from sklearn.preprocessing import StandardScaler | |
| from statsmodels.tsa.arima.model import ARIMA | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
# Page heading rendered at the top of the app.
st.title('Clustering and Time Series Analysis')

# Step 1: CSV upload widget. The rest of the script only runs once this
# is something other than None.
uploaded_file = st.file_uploader(label="Upload a CSV file", type=["csv"])
# Largest cluster count offered in the UI and scanned in the comparison table.
MAX_CLUSTERS = 7
# Fewest complete rows for which the cluster-count slider (2 .. n_rows - 1)
# still has a non-degenerate range.
MIN_ROWS = 4


def _silhouette_or_none(features, labels):
    """Return the silhouette score for ``labels``, or ``None`` if undefined.

    ``silhouette_score`` raises unless the number of distinct labels lies
    between 2 and ``n_samples - 1``. DBSCAN can violate either bound (one
    single cluster, or every point in its own cluster when ``min_samples``
    is 1), so the bounds are checked here instead of letting the app crash.
    """
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        return None
    return silhouette_score(features, labels)


def _find_datetime_column(frame):
    """Return ``(column_name, timestamps)`` for the first date-like column.

    ``pd.read_csv`` leaves dates as plain strings unless asked to parse them,
    so a dtype check alone never matches. Non-numeric columns are therefore
    test-parsed, and a column qualifies only when every non-missing value
    parses as a timestamp. Returns ``(None, None)`` when no column qualifies.
    """
    for col in frame.columns:
        column = frame[col]
        if pd.api.types.is_datetime64_any_dtype(column):
            return col, column
        if pd.api.types.is_numeric_dtype(column):
            # Numbers (and booleans) would be misread as epoch offsets.
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (TypeError, ValueError):
            continue
        n_present = column.notna().sum()
        if n_present > 0 and parsed.notna().sum() == n_present:
            return col, parsed
    return None, None


if uploaded_file is not None:
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
        st.stop()
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Only numerical columns take part in clustering.
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)

    # The clustering estimators reject NaN, so drop columns that are entirely
    # empty and then every row that still has a missing value.
    features = (
        data[numerical_cols]
        .dropna(axis="columns", how="all")
        .dropna(axis="index")
    )
    feature_cols = features.columns.tolist()
    if not feature_cols or len(features) < MIN_ROWS:
        st.warning(
            f"Need at least one numerical column and {MIN_ROWS} rows without "
            "missing numerical values to run clustering."
        )
        st.stop()
    n_dropped = len(data) - len(features)
    if n_dropped:
        st.write(f"Dropped {n_dropped} rows with missing numerical values before clustering.")

    # Step 2.1: Data Standardization using StandardScaler (always applied)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(features)
    st.write("Data has been standardized using StandardScaler.")

    # A silhouette score needs fewer clusters than samples, so small uploads
    # lower the cap on the cluster count.
    max_k = min(MAX_CLUSTERS, data_scaled.shape[0] - 1)

    # Step 3: Clustering Algorithm Selection
    clustering_method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if clustering_method == "DBSCAN":
        eps_value = st.slider("Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5)
        dbscan = DBSCAN(eps=eps_value, min_samples=min_samples_value)
        cluster_labels = dbscan.fit_predict(data_scaled)

        # Check if DBSCAN found a labelling the silhouette score is defined for.
        silhouette_avg = _silhouette_or_none(data_scaled, cluster_labels)
        if silhouette_avg is not None:
            st.write(f"DBSCAN Silhouette Score: {silhouette_avg}")
        else:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
    else:
        # K-Means and Hierarchical Clustering share the same controls and
        # reporting; only the estimator differs.
        k_range = st.slider(
            f"Select number of clusters for {clustering_method}",
            min_value=2,
            max_value=max_k,
            value=3,
        )
        if clustering_method == "K-Means":
            # n_init is pinned so results do not depend on the default of the
            # installed scikit-learn version.
            model = KMeans(n_clusters=k_range, n_init=10, random_state=42)
        else:
            model = AgglomerativeClustering(n_clusters=k_range)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = _silhouette_or_none(data_scaled, cluster_labels)
        st.write(f"{clustering_method} Silhouette Score for {k_range} clusters: {silhouette_avg}")

    # Step 4: Visualize the clusters using Plotly
    if len(set(cluster_labels)) > 1:
        st.write("Cluster Labels:", np.unique(cluster_labels))
        if data_scaled.shape[1] >= 2:
            # Labels are passed as strings so Plotly draws discrete cluster
            # colours instead of a continuous colour bar. The y-axis is left
            # on auto-range: standardized values regularly fall outside
            # [-1, 1], so a fixed range would hide points.
            fig = px.scatter(
                x=data_scaled[:, 0],
                y=data_scaled[:, 1],
                color=cluster_labels.astype(str),
                title="Clustering Results",
                labels={"x": feature_cols[0], "y": feature_cols[1], "color": "Cluster"},
            )
            fig.update_layout(
                xaxis_title=feature_cols[0],
                yaxis_title=feature_cols[1],
            )
            st.plotly_chart(fig)
        else:
            st.write("At least two numerical columns are needed for the cluster scatter plot.")

    # Step 5: ARIMA Time Series Analysis
    # The date-like column supplies the ordering / x-axis; the model itself is
    # fitted on a numerical column chosen by the user.
    time_series_col, timestamps = _find_datetime_column(data)
    if time_series_col is not None:
        st.write("Time Series Analysis (ARIMA) indexed by column:", time_series_col)
        value_col = st.selectbox("Numerical column to model with ARIMA", feature_cols)

        series = (
            pd.DataFrame({"timestamp": timestamps, "value": data[value_col]})
            .dropna()
            .sort_values("timestamp", kind="stable")
        )

        # ARIMA model order
        p = int(st.number_input("ARIMA p value", min_value=0, max_value=5, value=1))
        d = int(st.number_input("ARIMA d value", min_value=0, max_value=2, value=1))
        q = int(st.number_input("ARIMA q value", min_value=0, max_value=5, value=1))

        if len(series) <= p + d + q + 1:
            st.warning("Not enough observations to fit an ARIMA model of this order.")
        else:
            try:
                # Fit on plain values: the timestamps are only used for
                # plotting, so an irregular or duplicated date index cannot
                # upset the model.
                arima_model = ARIMA(series["value"].to_numpy(dtype=float), order=(p, d, q))
                arima_result = arima_model.fit()
            except (ValueError, np.linalg.LinAlgError) as exc:
                st.error(f"ARIMA model could not be fitted: {exc}")
            else:
                # Display ARIMA result summary as fixed-width text so the
                # table columns stay aligned.
                st.text(arima_result.summary().as_text())

                # Plot observed values against in-sample one-step predictions.
                # The first d points are skipped because predictions there
                # precede the differencing warm-up and are not meaningful.
                shown_times = series["timestamp"].iloc[d:]
                fig = go.Figure()
                fig.add_trace(go.Scatter(x=shown_times, y=series["value"].iloc[d:],
                                         mode='lines', name='Observed'))
                fig.add_trace(go.Scatter(x=shown_times, y=arima_result.fittedvalues[d:],
                                         mode='lines', name='ARIMA in-sample prediction'))
                fig.update_layout(
                    title=f"ARIMA({p}, {d}, {q}) fit for {value_col}",
                    xaxis_title=str(time_series_col),
                    yaxis_title=str(value_col),
                )
                st.plotly_chart(fig)

    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    st.write(f"### Silhouette Score Table for 2-{max_k} Clusters")
    score_rows = []
    for n_clusters in range(2, max_k + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(data_scaled)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(data_scaled)
        score_rows.append({
            'Number of Clusters': n_clusters,
            'K-Means Silhouette Score': _silhouette_or_none(data_scaled, kmeans_labels),
            'Hierarchical Silhouette Score': _silhouette_or_none(data_scaled, hierarchical_labels),
        })
    silhouette_df = pd.DataFrame(
        score_rows,
        columns=['Number of Clusters', 'K-Means Silhouette Score', 'Hierarchical Silhouette Score'],
    )
    # Show the actual table the heading promises, then the chart.
    st.dataframe(silhouette_df)

    # Plot the silhouette scores using Plotly
    fig = go.Figure()
    # Plot K-Means Silhouette Scores
    fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['K-Means Silhouette Score'],
                             mode='lines+markers', name='K-Means Silhouette Score'))
    # Plot Hierarchical Silhouette Scores
    fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['Hierarchical Silhouette Score'],
                             mode='lines+markers', name='Hierarchical Silhouette Score'))
    # Silhouette scores are bounded by [-1, 1], so a fixed axis with 0.2 ticks
    # makes runs directly comparable.
    fig.update_layout(
        title="Silhouette Scores for K-Means and Hierarchical Clustering",
        xaxis_title="Number of Clusters",
        yaxis_title="Silhouette Score",
        yaxis=dict(range=[-1, 1], dtick=0.2)
    )
    st.plotly_chart(fig)