"""Streamlit dashboard for exploring clustering models on stock market data.

The script loads the "Ci-Dave/SPY500MW_StockMarket" dataset, standardizes its
numeric columns, loads pre-trained clustering models from disk, and renders one
page per model (selected from the sidebar).
"""

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# Number of flat clusters cut from the hierarchical linkage tree.
HIERARCHICAL_N_CLUSTERS = 4

# Markdown shown on the "Home" page. Kept at column 0 so list items render as
# a list instead of an indented code block.
HOME_TEXT = """
This application analyzes stock market data using clustering techniques.
Explore different clustering models (K-Means, Hierarchical, DBSCAN, and GMM) to identify patterns in stock prices.

**Dataset:** The dataset includes stock market price data with key attributes such as Open, High, Low, and Close values.

**Clustering Models:**
- **K-Means:** Groups stocks into clusters based on price similarities.
- **Hierarchical Clustering:** Forms a tree-like structure to identify stock relationships.
- **DBSCAN:** Detects core clusters and outliers in stock prices.
- **Gaussian Mixture Model (GMM):** Uses probabilistic clustering for stock analysis.

**Instructions:**
- Select a clustering model from the sidebar.
- View the clustering results, including cluster assignments, distributions, and visualizations.
"""


# Load dataset
@st.cache_data
def load_data():
    """Return the "train" split of the stock market dataset as a DataFrame.

    The result is cached by ``st.cache_data`` so the dataset is not fetched
    again on every Streamlit rerun.
    """
    dataset = load_dataset("Ci-Dave/SPY500MW_StockMarket")
    return pd.DataFrame(dataset["train"])


# Preprocessing function
def preprocess_data(df):
    """Select numeric columns, drop incomplete rows, and standardize them.

    Args:
        df: Raw dataset as a DataFrame.

    Returns:
        A tuple ``(df_numeric, df_scaled, scaler)``: the numeric rows without
        missing values, the standardized values as an array, and the fitted
        ``StandardScaler``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df_numeric = df[numeric_cols].dropna()
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_numeric)
    return df_numeric, df_scaled, scaler


# Load trained models with error handling
def load_model(filename):
    """Load a trained model from ``filename``.

    Shows a Streamlit error and returns ``None`` when the file does not exist.
    """
    if not os.path.exists(filename):
        st.error(f"Model file {filename} not found. Make sure you uploaded the trained model.")
        return None
    # NOTE: joblib files are pickle-based; only load model files you trust.
    return joblib.load(filename)


def _show_figure(fig):
    """Render ``fig`` and close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def _show_cluster_tables(frame, column):
    """Display the first cluster assignments and the per-cluster row counts."""
    st.write("Cluster Assignments:")
    st.dataframe(frame[[column]].head())

    counts = frame[column].value_counts().reset_index()
    counts.columns = ["Cluster", "Count"]
    st.write("Cluster Distribution:")
    st.dataframe(counts)


def _show_silhouette(features, labels):
    """Display the silhouette score, or a warning when it is undefined."""
    n_labels = np.unique(labels).size
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1.
    if 2 <= n_labels <= len(labels) - 1:
        score = silhouette_score(features, labels)
        st.write(f"Silhouette Score: {score:.4f}")
    else:
        st.warning("Silhouette Score is undefined for the number of clusters found.")


def _show_scatter(frame, labels, palette="viridis"):
    """Scatter-plot the first two columns of ``frame`` colored by ``labels``."""
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=frame.iloc[:, 0],
        y=frame.iloc[:, 1],
        hue=labels,
        palette=palette,
        ax=ax,
    )
    _show_figure(fig)


def _dbscan_palette(labels):
    """Build a label -> color mapping with outliers (label -1) in red."""
    cluster_ids = sorted(int(label) for label in set(labels) if label != -1)
    cmap = sns.color_palette("viridis", as_cmap=True)
    # Spread colors over the clusters actually present; avoid dividing by zero
    # when there is only one cluster.
    span = max(len(cluster_ids) - 1, 1)
    palette = {label: cmap(rank / span) for rank, label in enumerate(cluster_ids)}
    palette[-1] = "red"  # Mark outliers in red
    return palette


# Load dataset
df = load_data()
df_numeric, df_scaled, scaler = preprocess_data(df)

kmeans = load_model("kmeans_stock_model.pkl")
# Loaded for parity with the other models; the "Hierarchical" page below
# recomputes the linkage from the data and does not read this object.
hierarchical = load_model("hierarchical_stock_model.pkl")
dbscan = load_model("dbscan_stock_model.pkl")
gmm = load_model("gmm_stock_model.pkl")

# Sidebar navigation
st.sidebar.title("Stock Market Clustering")
page = st.sidebar.radio(
    "Choose a model:",
    ["Home", "Dataset", "K-Means", "Hierarchical", "DBSCAN", "GMM"],
)

if page == "Home":
    st.title("Stock Market Clustering Analysis")
    st.write(HOME_TEXT)

elif page == "Dataset":
    st.title("Stock Market Dataset")

    st.write("### Dataset Preview:")
    # Show only key features
    st.dataframe(df[["Open", "High", "Low", "Close"]].head(), use_container_width=True)

    st.write("### Full Dataset:")
    st.dataframe(df, use_container_width=True)

    st.write("### Dataset Summary:")
    # st.dataframe (not st.write) is the command that accepts use_container_width.
    st.dataframe(df_numeric.describe(), use_container_width=True)

    st.write("### Correlation Heatmap:")
    fig, ax = plt.subplots()
    sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", ax=ax)
    _show_figure(fig)

elif page == "K-Means":
    st.title("K-Means Clustering")
    if kmeans is not None:
        clusters = kmeans.predict(df_scaled)
        df_numeric["KMeans Cluster"] = clusters

        _show_cluster_tables(df_numeric, "KMeans Cluster")
        _show_silhouette(df_scaled, clusters)
        _show_scatter(df_numeric, clusters)

elif page == "Hierarchical":
    st.title("Hierarchical Clustering")

    # Build the Ward linkage matrix and cut it into a fixed number of clusters.
    linked = linkage(df_scaled, method="ward")
    clusters = fcluster(linked, HIERARCHICAL_N_CLUSTERS, criterion="maxclust")
    df_numeric["Hierarchical Cluster"] = clusters

    _show_cluster_tables(df_numeric, "Hierarchical Cluster")

    # Dendrogram Visualization
    fig, ax = plt.subplots(figsize=(10, 5))
    dendrogram(linked, truncate_mode="level", p=5, ax=ax)
    _show_figure(fig)

    # Scatter Plot of First Two Features
    _show_scatter(df_numeric, clusters)

elif page == "DBSCAN":
    st.title("DBSCAN Clustering")
    if dbscan is not None:
        # The loaded estimator is re-fitted on the current data here.
        clusters = dbscan.fit_predict(df_scaled)
        df_numeric["DBSCAN Cluster"] = clusters

        _show_cluster_tables(df_numeric, "DBSCAN Cluster")

        # Visualization: different color for outliers
        _show_scatter(df_numeric, clusters, palette=_dbscan_palette(clusters))

elif page == "GMM":
    st.title("Gaussian Mixture Model (GMM)")
    if gmm is not None:
        # Keep only as many leading columns as the model's means have features.
        expected_features = gmm.means_.shape[1]
        df_gmm = df_scaled[:, :expected_features]
        clusters = gmm.predict(df_gmm)
        df_numeric["GMM Cluster"] = clusters

        _show_cluster_tables(df_numeric, "GMM Cluster")
        _show_silhouette(df_gmm, clusters)
        _show_scatter(df_numeric, clusters)