Spaces:

rayyanphysicist
/

Vehicle_Clustering_Analysis

Sleeping

App Files Files Community

Vehicle_Clustering_Analysis / app.py

rayyanphysicist

Upload 2 files

1d1c8c1 verified almost 2 years ago

raw

history blame contribute delete

5.34 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import scipy.cluster.hierarchy as shc
	from sklearn.preprocessing import StandardScaler
	from sklearn.ensemble import IsolationForest
	from sklearn.cluster import AgglomerativeClustering
	from sklearn.decomposition import PCA

	# Function to load and preprocess the data
	def preprocess_data(file):
	    """Load a vehicle CSV and return a numeric, model-ready DataFrame.

	    Steps, in order:
	      1. Read the CSV and drop every row that already contains a NaN.
	      2. Turn the literal ``"$null$"`` sentinel into NaN.
	      3. Coerce ``type`` and the measurement columns to numeric
	         (unparseable values become NaN).
	      4. Fill NaNs in the measurement columns with each column's mean.
	      5. One-hot encode ``manufact`` and ``model``.

	    Args:
	        file: Path or file-like object accepted by ``pandas.read_csv``.

	    Returns:
	        A ``(data, numeric_columns)`` tuple: the processed DataFrame and the
	        list of measurement column names that were coerced and imputed.

	    Raises:
	        KeyError: If an expected column is missing from the CSV.
	    """
	    data = pd.read_csv(file)

	    # NOTE: dropna() runs *before* the sentinel replacement, so only rows
	    # with cells pandas itself parsed as missing are discarded; rows whose
	    # gaps are spelled "$null$" survive and are mean-imputed below.
	    data = data.dropna()
	    data = data.replace("$null$", np.nan)

	    numeric_columns = [
	        'sales', 'resale', 'price', 'engine_s', 'horsepow',
	        'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
	        'mpg', 'lnsales',
	    ]

	    # Coerce to numeric; anything that cannot be parsed becomes NaN.
	    # 'type' is converted once (it is not imputed, matching the original).
	    data["type"] = pd.to_numeric(data["type"], errors='coerce')
	    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

	    # Assign the filled result back instead of calling
	    # data[col].fillna(..., inplace=True): the chained in-place form acts on
	    # a temporary under pandas Copy-on-Write and leaves the NaNs in place.
	    column_means = data[numeric_columns].mean()
	    data[numeric_columns] = data[numeric_columns].fillna(column_means)

	    # One-hot encode the categorical identifiers.
	    data = pd.get_dummies(data, columns=['manufact', 'model'])

	    return data, numeric_columns

	# Function to train the Isolation Forest model and perform hierarchical clustering
	def train_model(data, numerical_features, contamination=0.05, random_state=42):
	    """Scale the data, drop Isolation Forest outliers, and build a Ward linkage.

	    Args:
	        data: Fully numeric DataFrame (for example the output of the
	            preprocessing step). It is not modified; a copy is processed.
	        numerical_features: Column names to standardise with
	            ``StandardScaler``.
	        contamination: Expected outlier fraction passed to
	            ``IsolationForest``.
	        random_state: Seed for ``IsolationForest`` so the same upload always
	            yields the same outliers and therefore the same clusters. Pass
	            ``None`` for a fresh random forest on every call.

	    Returns:
	        A ``(dend, data, scaler)`` tuple: the linkage matrix from
	        ``scipy.cluster.hierarchy.linkage`` (Ward method), the scaled
	        DataFrame with outlier rows removed, and the fitted scaler.
	    """
	    # Work on a private copy so the caller's DataFrame is not scaled and
	    # filtered behind its back.
	    data = data.copy()

	    # Standardise the numerical columns.
	    scaler = StandardScaler()
	    data[numerical_features] = scaler.fit_transform(data[numerical_features])

	    # Fit the outlier detector on all columns and label every row
	    # (-1 marks an outlier, 1 an inlier).
	    model = IsolationForest(contamination=contamination, random_state=random_state)
	    model.fit(data)
	    outliers = model.predict(data)

	    # Remove the rows flagged as outliers.
	    outlier_indices = data.index[outliers == -1]
	    data.drop(index=outlier_indices, inplace=True)

	    # Hierarchical clustering on the remaining rows.
	    dend = shc.linkage(data, method='ward')

	    return dend, data, scaler

	# Main function to run the Streamlit app
	def _show_figure(fig):
	    """Render *fig* in Streamlit, then close it so pyplot stops tracking it."""
	    st.pyplot(fig)
	    # Figures created through pyplot stay registered until closed; without
	    # this, every run of the app would keep four more figures alive.
	    plt.close(fig)


	def main():
	    """Run the Streamlit app: upload, preprocess, cluster, and visualise.

	    After a CSV is uploaded the app shows the head of the preprocessed
	    data, the shape after outlier removal, per-feature histograms, a box
	    plot, the Ward dendrogram, the derived cluster count, the cluster sizes,
	    and a 2-component PCA scatter coloured by cluster.
	    """
	    st.title('Vehicle Clustering Analysis App')

	    # Allow user to upload a CSV file
	    uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])
	    if uploaded_file is None:
	        return

	    st.subheader('Uploaded Data')
	    try:
	        data, numerical_features = preprocess_data(uploaded_file)
	    except (KeyError, ValueError) as exc:
	        # A CSV without the expected columns (KeyError) or one pandas cannot
	        # parse (ValueError subclasses) gets a readable message, not a traceback.
	        st.error(f"Could not process the uploaded CSV: {exc}")
	        return

	    # Display the first few rows of the data
	    st.write(data.head())

	    # Train model and perform clustering
	    dend, data, scaler = train_model(data, numerical_features)

	    st.subheader("Data Dimensions")
	    st.write(data.shape)

	    # Histograms: one subplot per numerical feature on a 3-column grid.
	    st.subheader('Histograms')
	    num_cols = 3
	    num_rows = int(np.ceil(len(numerical_features) / num_cols))

	    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))
	    axes = axes.flatten()

	    for ax, col in zip(axes, numerical_features):
	        ax.hist(data[col], bins=20)
	        ax.set_title(col)

	    # Remove any grid cells left over after the last feature.
	    for unused_ax in axes[len(numerical_features):]:
	        fig.delaxes(unused_ax)

	    _show_figure(fig)

	    # Box plots for the numerical features.
	    st.subheader('Boxplots')
	    fig, ax = plt.subplots(figsize=(12, 8))
	    sns.boxplot(data=data[numerical_features], ax=ax)
	    ax.tick_params(axis='x', labelrotation=45)
	    _show_figure(fig)

	    # Dendrogram of the Ward linkage.
	    st.subheader('Dendrogram')
	    fig, ax = plt.subplots(figsize=(10, 6))
	    ax.set_title('Hierarchical Clustering Dendrogram')
	    dendrogram = shc.dendrogram(dend, ax=ax)
	    _show_figure(fig)

	    # Cluster count heuristic: number of distinct link colours in the
	    # dendrogram minus one (the colour used above the cut threshold).
	    # Floor at 1 so a tiny dataset whose links all share one colour does
	    # not ask AgglomerativeClustering for zero clusters.
	    unique_colors = set(dendrogram['color_list'])
	    number_of_clusters = max(1, len(unique_colors) - 1)
	    st.subheader("Number of Clusters")
	    st.write(number_of_clusters)

	    # Hierarchical clustering with AgglomerativeClustering
	    agg_clustering = AgglomerativeClustering(n_clusters=number_of_clusters)
	    agg_clustering.fit(data)
	    cluster_labels = agg_clustering.labels_

	    # Attach the labels only after fitting, so they are never a feature.
	    data['cluster'] = cluster_labels

	    st.subheader("Cluster Counts")
	    st.write(data['cluster'].value_counts())

	    # PCA on the numerical features only, for a 2-D view of the clusters.
	    st.subheader('PCA for Cluster Visualization')
	    pca = PCA(n_components=2)
	    principal_components = pca.fit_transform(data[numerical_features])
	    pca_df = pd.DataFrame(data=principal_components, columns=['Component 1', 'Component 2'])
	    # cluster_labels is a plain array, so this assignment is positional and
	    # lines up with the PCA rows regardless of the DataFrame index.
	    pca_df['cluster'] = cluster_labels

	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.scatterplot(x='Component 1', y='Component 2', hue='cluster', data=pca_df, palette='viridis', ax=ax)
	    ax.set_title('Clusters visualized using PCA')
	    _show_figure(fig)



	if __name__ == "__main__":
	    # Start the app only when this file is executed as a script
	    # (e.g. via `streamlit run app.py`), not when it is imported.
	    main()