import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as shc
import seaborn as sns
import streamlit as st
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler


# Function to load and preprocess the data
def preprocess_data(file):
    """Load a vehicle CSV and turn it into an all-numeric feature table.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).
            The CSV must contain the columns ``type``, ``manufact``,
            ``model`` and every name listed in the returned numeric columns.

    Returns:
        A ``(data, numeric_columns)`` tuple: the cleaned, one-hot encoded
        DataFrame and the list of continuous column names that were coerced
        to numeric and mean-imputed.
    """
    numeric_columns = [
        'sales', 'resale', 'price', 'engine_s', 'horsepow', 'wheelbas',
        'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg', 'lnsales',
    ]

    data = pd.read_csv(file)

    # Order matters: rows with genuinely empty cells are dropped first, and
    # only afterwards are "$null$" markers turned into NaN, so those rows are
    # kept and imputed below instead of being discarded.
    data = data.dropna()
    data = data.replace("$null$", np.nan)

    # Coerce to numbers; anything unparseable becomes NaN.
    data["type"] = pd.to_numeric(data["type"], errors='coerce')
    data[numeric_columns] = data[numeric_columns].apply(
        pd.to_numeric, errors='coerce'
    )

    # 'type' is not mean-imputed, so a row whose 'type' could not be parsed
    # would carry NaN into the clustering step, which rejects non-finite
    # input. Drop such rows.
    data = data.dropna(subset=["type"])

    # Mean-impute the continuous columns in one assignment. The previous
    # per-column ``data[col].fillna(..., inplace=True)`` is a chained
    # assignment that does nothing under pandas Copy-on-Write.
    data[numeric_columns] = data[numeric_columns].fillna(
        data[numeric_columns].mean()
    )

    # Perform one-hot encoding on 'manufact' and 'model' columns
    data = pd.get_dummies(data, columns=['manufact', 'model'])

    return data, numeric_columns


# Function to train the Isolation Forest model and perform hierarchical clustering
def train_model(data, numerical_features, *, contamination=0.05, random_state=None):
    """Scale features, drop Isolation Forest outliers and build a Ward linkage.

    Args:
        data: All-numeric DataFrame, e.g. the one from ``preprocess_data``.
            It is not modified; a copy is processed and returned.
        numerical_features: Column names to standardise with
            ``StandardScaler``.
        contamination: Outlier fraction passed to ``IsolationForest``.
        random_state: Seed passed to ``IsolationForest``; pass an int for
            reproducible outlier removal.

    Returns:
        A ``(dend, data, scaler)`` tuple: the linkage matrix from
        ``scipy.cluster.hierarchy.linkage``, the scaled DataFrame without
        the rows flagged as outliers, and the fitted ``StandardScaler``.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    # Normalize numerical variables
    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])

    # Train Isolation Forest model and flag outliers (-1) vs inliers (+1)
    model = IsolationForest(contamination=contamination, random_state=random_state)
    predictions = model.fit(data).predict(data)

    # Remove outliers from the DataFrame
    outlier_indices = data.index[predictions == -1]
    data = data.drop(index=outlier_indices)

    # Hierarchical clustering
    dend = shc.linkage(data, method='ward')

    return dend, data, scaler


# Main function to run the Streamlit app
def main():
    """Run the Streamlit app: upload a CSV, clean it, cluster it, plot it.

    Nothing is rendered below the uploader until a file has been supplied.
    After upload the page shows the preprocessed data head, the data shape
    after outlier removal, histograms, a box plot, the dendrogram, the
    derived number of clusters, cluster counts and a 2-D PCA scatter plot.
    """
    st.title('Vehicle Clustering Analysis App')

    # Allow user to upload a CSV file
    uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])
    if uploaded_file is None:
        return

    st.subheader('Uploaded Data')

    # Preprocess the data
    data, numerical_features = preprocess_data(uploaded_file)

    # Display the first few rows of the data
    st.write(data.head())

    # Train model and perform clustering
    dend, data, scaler = train_model(data, numerical_features)

    st.subheader("Data Dimensions")
    st.write(data.shape)

    # Plot histograms for numerical features
    st.subheader('Histograms')
    num_cols = 3
    num_rows = int(np.ceil(len(numerical_features) / num_cols))
    # squeeze=False guarantees a 2-D array of axes whatever the grid size.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False
    )
    axes = axes.ravel()
    for ax, col in zip(axes, numerical_features):
        ax.hist(data[col], bins=20)
        ax.set_title(col)
    # Remove any empty subplots
    for unused_ax in axes[len(numerical_features):]:
        fig.delaxes(unused_ax)
    _show_figure(fig)

    # Plot box plots for numerical features
    st.subheader('Boxplots')
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=data[numerical_features], ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    _show_figure(fig)

    # Plot dendrogram
    st.subheader('Dendrogram')
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Hierarchical Clustering Dendrogram')
    dendrogram = shc.dendrogram(dend, ax=ax)
    _show_figure(fig)

    # Number of clusters: distinct link colours minus one.
    # Clamp to 1 because AgglomerativeClustering rejects n_clusters < 1.
    unique_colors = set(dendrogram['color_list'])
    number_of_clusters = max(1, len(unique_colors) - 1)
    st.subheader("Number of Clusters")
    st.write(number_of_clusters)

    # Hierarchical clustering with AgglomerativeClustering
    agg_clustering = AgglomerativeClustering(n_clusters=number_of_clusters)
    agg_clustering.fit(data)

    # Retrieve the cluster labels
    cluster_labels = agg_clustering.labels_

    # Add the cluster labels to the DataFrame
    data['cluster'] = cluster_labels

    # Print the counts of each cluster
    st.subheader("Cluster Counts")
    st.write(data['cluster'].value_counts())

    # PCA for visualization
    st.subheader('PCA for Cluster Visualization')
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(data[numerical_features])
    pca_df = pd.DataFrame(
        data=principal_components, columns=['Component 1', 'Component 2']
    )
    pca_df['cluster'] = cluster_labels

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x='Component 1', y='Component 2', hue='cluster',
        data=pca_df, palette='viridis', ax=ax,
    )
    ax.set_title('Clusters visualized using PCA')
    _show_figure(fig)


def _show_figure(fig):
    """Render *fig* in the Streamlit page, then close it.

    pyplot keeps every figure it creates until it is closed, so closing
    here stops figures piling up each time the script reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


if __name__ == "__main__":
    main()