| import streamlit as st
|
| import pandas as pd
|
| import numpy as np
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
| import scipy.cluster.hierarchy as shc
|
| from sklearn.preprocessing import StandardScaler
|
| from sklearn.ensemble import IsolationForest
|
| from sklearn.cluster import AgglomerativeClustering
|
| from sklearn.decomposition import PCA
|
|
|
|
|
def preprocess_data(file):
    """Load a vehicle CSV and convert it into an all-numeric feature frame.

    Processing steps, in order:

    1. Read the CSV and drop every row that contains an empty cell.
    2. Treat the literal ``"$null$"`` placeholder as a missing value.
    3. Coerce the measurement columns and ``type`` to numbers; anything
       unparseable becomes NaN.
    4. Drop rows whose ``type`` is missing (a category code cannot be
       mean-imputed, and a NaN here would break the downstream model).
    5. Fill remaining gaps in the measurement columns with the column mean.
    6. One-hot encode ``manufact`` and ``model``.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A ``(data, numeric_columns)`` tuple: the processed DataFrame and the
        list of measurement column names that were coerced and imputed.

    Raises:
        KeyError: If any of the expected columns is absent from the CSV.
    """
    numeric_columns = [
        'sales', 'resale', 'price', 'engine_s', 'horsepow',
        'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
        'mpg', 'lnsales',
    ]

    # Empty cells are dropped first; "$null$" placeholders are only turned
    # into NaN afterwards so that they get imputed instead of dropped.
    data = pd.read_csv(file).dropna()
    data = data.replace("$null$", np.nan)

    # Coerce every column that must be numeric in a single pass.
    coerced_columns = numeric_columns + ["type"]
    data[coerced_columns] = data[coerced_columns].apply(
        pd.to_numeric, errors='coerce'
    )

    # "type" is not part of the imputed columns, so remove rows where it
    # could not be parsed rather than leaving NaN in the output.
    data = data.dropna(subset=["type"])

    # Assign the filled frame back explicitly: calling fillna(inplace=True)
    # on a column selection is chained assignment and is not guaranteed to
    # modify `data`.
    data[numeric_columns] = data[numeric_columns].fillna(
        data[numeric_columns].mean()
    )

    data = pd.get_dummies(data, columns=['manufact', 'model'])

    return data, numeric_columns
|
|
|
|
|
def train_model(data, numerical_features):
    """Standardise the features, remove outliers and build a Ward linkage.

    Args:
        data: DataFrame of features; every column is passed to the outlier
            model and to the linkage, so all columns must be numeric.
        numerical_features: Names of the columns in ``data`` to standardise.

    Returns:
        A ``(dend, data, scaler)`` tuple:

        * ``dend`` - linkage matrix from ``shc.linkage(..., method='ward')``
          computed on the scaled, outlier-free rows.
        * ``data`` - a new DataFrame with ``numerical_features`` scaled and
          the rows flagged as outliers removed.
        * ``scaler`` - the fitted ``StandardScaler``.
    """
    # Work on a copy so the caller's frame is neither rescaled nor
    # truncated as a side effect of calling this function.
    data = data.copy()

    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])

    # A fixed seed keeps the set of removed rows identical between runs on
    # the same input.
    model = IsolationForest(contamination=0.05, random_state=42)
    outliers = model.fit_predict(data)

    # The model labels outliers as -1; drop those rows (non in-place, so
    # the result is an independent frame that is safe to add columns to).
    outlier_indices = data.index[outliers == -1]
    data = data.drop(index=outlier_indices)

    dend = shc.linkage(data, method='ward')

    return dend, data, scaler
|
|
|
|
|
def _show_figure(fig):
    """Render ``fig`` in Streamlit, then close it so figures do not pile up."""
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Run the Streamlit vehicle-clustering app.

    Asks for a CSV upload. Once a file is provided it preprocesses the data,
    removes outliers, and renders: a preview of the data, its dimensions,
    per-feature histograms, boxplots, a hierarchical-clustering dendrogram,
    the derived number of clusters, the size of each cluster, and a 2-D PCA
    scatter plot coloured by cluster.
    """
    st.title('Vehicle Clustering Analysis App')

    uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])
    if uploaded_file is None:
        # Nothing to analyse until the user provides a file.
        return

    st.subheader('Uploaded Data')
    data, numerical_features = preprocess_data(uploaded_file)
    st.write(data.head())

    if len(data) < 2:
        # Scaling and linkage need at least two observations.
        st.warning('Not enough complete rows to run the clustering analysis.')
        return

    # From here on `data` is the scaled, outlier-free frame returned by
    # train_model; the fitted scaler is not needed for the plots below.
    dend, data, _scaler = train_model(data, numerical_features)

    st.subheader("Data Dimensions")
    st.write(data.shape)

    st.subheader('Histograms')
    num_cols = 3
    num_rows = int(np.ceil(len(numerical_features) / num_cols))
    # squeeze=False always yields a 2-D array of axes, whatever the grid size.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False
    )
    axes = axes.flatten()
    for ax, col in zip(axes, numerical_features):
        ax.hist(data[col], bins=20)
        ax.set_title(col)
    # Remove the grid cells that have no feature to display.
    for unused_ax in axes[len(numerical_features):]:
        fig.delaxes(unused_ax)
    _show_figure(fig)

    st.subheader('Boxplots')
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=data[numerical_features], ax=ax)
    ax.tick_params(axis='x', labelrotation=45)
    _show_figure(fig)

    st.subheader('Dendrogram')
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Hierarchical Clustering Dendrogram')
    dendrogram = shc.dendrogram(dend, ax=ax)
    _show_figure(fig)

    # The cluster count is taken from the number of distinct link colours in
    # the dendrogram, minus one for the colour of the links above the colour
    # threshold. Clamp to 1 so a dendrogram with a single colour still gives
    # a valid n_clusters.
    # NOTE(review): this is a heuristic tied to the plot colouring; confirm
    # it is the intended way to choose the number of clusters.
    unique_colors = set(dendrogram['color_list'])
    number_of_clusters = max(1, len(unique_colors) - 1)
    st.subheader("Number of Clusters")
    st.write(number_of_clusters)

    agg_clustering = AgglomerativeClustering(n_clusters=number_of_clusters)
    agg_clustering.fit(data)
    cluster_labels = agg_clustering.labels_

    # Added only after fitting, so the label column is never used as an
    # input feature of the clustering itself.
    data['cluster'] = cluster_labels

    st.subheader("Cluster Counts")
    st.write(data['cluster'].value_counts())

    st.subheader('PCA for Cluster Visualization')
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(data[numerical_features])
    pca_df = pd.DataFrame(
        data=principal_components, columns=['Component 1', 'Component 2']
    )
    # Labels are aligned by position with the rows passed to PCA.
    pca_df['cluster'] = cluster_labels

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x='Component 1', y='Component 2', hue='cluster',
        data=pca_df, palette='viridis', ax=ax,
    )
    ax.set_title('Clusters visualized using PCA')
    _show_figure(fig)
|
|
|
|
|
|
|
# Script entry point: launch the app only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|