|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from sklearn.cluster import DBSCAN |
|
|
|
|
|
st.title("Music Genre Clustering with DBSCAN")

# Path is relative to the working directory the app is launched from.
file_path = "top_10000_1950-now.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Every section below depends on the dataset, so show a readable message
    # instead of a raw traceback and halt this script run.
    st.error(f"Dataset file not found: {file_path}")
    st.stop()

# Numeric columns only: these feed the summary statistics, the correlation
# matrix, and the feature pickers used for clustering.
df_numeric = df.select_dtypes(include=[np.number])
|
|
|
|
|
|
|
|
# One tab per section of the app; tab1..tab3 are entered by the `with`
# blocks that follow.
tab_labels = ["Overview", "Visualization Matrix", "User Input"]
tab1, tab2, tab3 = st.tabs(tab_labels)
|
|
|
|
|
with tab1:
    # Preview of the full dataset (all columns, first rows only).
    st.write("### Dataset Overview")
    preview_rows = df.head()
    st.dataframe(preview_rows)

    # Summary statistics computed from the numeric columns only.
    st.write("### Dataset Information")
    numeric_summary = df_numeric.describe()
    st.write(numeric_summary)
|
|
|
|
|
with tab2:
    st.write("### Correlation Matrix")

    # Draw on an explicit Figure rather than pyplot's implicit "current
    # figure", and close it after rendering so figures do not pile up in
    # memory each time the script reruns.
    corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        df_numeric.corr(),
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        ax=corr_ax,
    )
    st.pyplot(corr_fig)
    plt.close(corr_fig)

    st.write("### Pairplot Visualization")

    numeric_columns = df_numeric.columns.tolist()
    # Preferred starting selection; keep only the names that are actually
    # among the options so a dataset lacking some of them still renders.
    preferred_defaults = ["Danceability", "Energy", "Tempo", "Loudness", "Valence"]
    pairplot_features = st.multiselect(
        "Select Features for Pairplot",
        numeric_columns,
        default=[name for name in preferred_defaults if name in numeric_columns],
    )

    if pairplot_features:
        # pairplot builds its own figure; render that one explicitly.
        pair_grid = sns.pairplot(df[pairplot_features])
        st.pyplot(pair_grid.figure)
        plt.close(pair_grid.figure)
|
|
|
|
|
def _run_clustering(data, numeric_data, features, eps=1.0, min_samples=10):
    """Cluster the selected features with DBSCAN and render the results.

    Args:
        data: Full dataset. A "Cluster" column is (re)written in place; rows
            that were excluded because of missing values keep NaN there.
        numeric_data: Numeric subset of ``data`` sharing its index.
        features: Column names of ``numeric_data`` to cluster on.
        eps: DBSCAN neighbourhood radius, applied to standardized features.
        min_samples: DBSCAN minimum neighbourhood size for a core point.
    """
    if len(features) < 2:
        st.warning("Please select at least two features for clustering.")
        return

    selected = numeric_data[features].dropna()
    if selected.empty:
        # Scaling an empty frame would raise; report it instead.
        st.warning("No rows remain after dropping missing values for the selected features.")
        return

    # Standardize so every feature contributes on the same scale to the
    # distances DBSCAN compares against eps.
    X_scaled = StandardScaler().fit_transform(selected)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

    # Reset first so labels from a previous run never leak into this one,
    # then write labels back by index so they line up with the original rows.
    data["Cluster"] = np.nan
    data.loc[selected.index, "Cluster"] = labels

    st.write("### Clustered Data:")
    # NOTE(review): assumes the dataset has "Track Name" and
    # "Artist Name(s)" columns -- confirm against the CSV schema.
    st.dataframe(data[["Track Name", "Artist Name(s)", "Cluster"]].dropna().head(20))

    st.write("### Cluster Visualization:")
    fig, ax = plt.subplots()
    # Only the first two selected features are drawn; points are coloured by
    # their cluster label.
    scatter = ax.scatter(
        X_scaled[:, 0],
        X_scaled[:, 1],
        c=labels,
        cmap="viridis",
        alpha=0.7,
    )
    ax.set_xlabel(f"{features[0]} (standardized)")
    ax.set_ylabel(f"{features[1]} (standardized)")
    legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
    ax.add_artist(legend1)
    st.pyplot(fig)
    # Release the figure so reruns do not accumulate open figures.
    plt.close(fig)


with tab3:
    st.write("### Clustering Settings")

    numeric_columns = df_numeric.columns.tolist()

    if len(numeric_columns) < 2:
        # The slider below starts at 2, so it cannot be built with fewer
        # numeric columns than that.
        st.warning("At least two numeric columns are required for clustering.")
    else:
        # Clamp the initial value so a dataset with fewer than five numeric
        # columns does not hand the slider a value above its maximum.
        num_features = st.slider(
            "Select Number of Features",
            2,
            len(numeric_columns),
            min(5, len(numeric_columns)),
        )
        # The slider only sets how many leading columns are pre-selected;
        # the user can still add or remove columns afterwards.
        features = st.multiselect(
            "Select Features for Clustering",
            numeric_columns,
            default=numeric_columns[:num_features],
        )

        if st.button("Run Clustering"):
            _run_clustering(df, df_numeric, features)
|
|
|