3v324v23's picture
final
1770461
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
# Page heading rendered above the tab bar.
st.title("Music Genre Clustering with DBSCAN")

# Read the track dataset from a CSV path relative to the working directory.
file_path = "top_10000_1950-now.csv"
df = pd.read_csv(file_path)

# Numeric-only view of the data; the summary statistics, plots and
# clustering further down all operate on this frame.
df_numeric = df.select_dtypes(include="number")

# One tab per section of the app, in display order.
tab1, tab2, tab3 = st.tabs(
    ["Overview", "Visualization Matrix", "User Input"]
)
with tab1:
    # Preview: the first rows of the unfiltered table.
    st.write("### Dataset Overview")
    head_rows = df.head()
    st.dataframe(head_rows)

    # Summary statistics, computed on the numeric columns only.
    st.write("### Dataset Information")
    numeric_summary = df_numeric.describe()
    st.write(numeric_summary)
with tab2:
    st.write("### Correlation Matrix")
    # Draw on an explicit Figure/Axes and hand that Figure to Streamlit,
    # instead of passing the pyplot module and relying on the implicit
    # "current figure".
    corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        df_numeric.corr(),
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        ax=corr_ax,
    )
    st.pyplot(corr_fig)
    # Figures created through pyplot stay registered until closed; close this
    # one so figures do not pile up each time the script is executed.
    plt.close(corr_fig)

    st.write("### Pairplot Visualization")
    numeric_columns = df_numeric.columns.tolist()
    # A multiselect default must be one of the offered options, so keep only
    # the preferred columns that this dataset actually contains.
    preferred_columns = ["Danceability", "Energy", "Tempo", "Loudness", "Valence"]
    pairplot_defaults = [
        column for column in preferred_columns if column in numeric_columns
    ]
    pairplot_features = st.multiselect(
        "Select Features for Pairplot",
        numeric_columns,
        default=pairplot_defaults,
    )
    if pairplot_features:
        # pairplot creates its own figure; render exactly that figure rather
        # than whatever pyplot currently considers active.
        pair_grid = sns.pairplot(df[pairplot_features])
        st.pyplot(pair_grid.figure)
        plt.close(pair_grid.figure)
with tab3:
    st.write("### Clustering Settings")
    numeric_columns = df_numeric.columns.tolist()
    column_count = len(numeric_columns)

    # The original slider always used the range 2..column_count with a start
    # value of 5, which is invalid when the dataset has fewer numeric columns.
    # A slider over a single-value or inverted range is not meaningful (and
    # may be rejected by Streamlit), so it is shown only when there is a real
    # choice, and its start value is capped at the number of columns.
    if column_count > 2:
        num_features = st.slider(
            "Select Number of Features", 2, column_count, min(5, column_count)
        )
    else:
        num_features = column_count

    # Pre-select the first `num_features` numeric columns; the user can still
    # add or remove columns freely.
    features = st.multiselect(
        "Select Features for Clustering",
        numeric_columns,
        default=numeric_columns[:num_features],
    )

    if st.button("Run Clustering"):
        # Only rows with a value for every selected feature can be clustered.
        df_filtered = df_numeric[features].dropna()

        if len(features) < 2:
            st.warning("Please select at least two features for clustering.")
        elif df_filtered.empty:
            # Fitting the scaler on zero rows would raise, so stop here with
            # a readable message instead.
            st.warning("No rows have values for all of the selected features.")
        else:
            # Standardize so every feature contributes on the same scale to
            # the distances DBSCAN compares against `eps`.
            X_scaled = StandardScaler().fit_transform(df_filtered)

            eps = 1.0  # Default value, can be modified as needed
            min_samples = 10  # Default value, can be modified as needed
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            # DBSCAN labels points it considers noise with -1.
            labels = dbscan.fit_predict(X_scaled)

            # assign() returns a new frame, avoiding an in-place write to a
            # frame derived from df_numeric (which can trigger pandas'
            # chained-assignment warning).
            df_filtered = df_filtered.assign(Cluster=labels)

            # Reset the column first so rows dropped above stay NaN.
            df["Cluster"] = np.nan
            df.loc[df_filtered.index, "Cluster"] = labels

            st.write("### Clustered Data:")
            # Hide only rows that were not clustered; a missing track or
            # artist name must not remove a clustered row from the preview.
            clustered_preview = (
                df[["Track Name", "Artist Name(s)", "Cluster"]]
                .dropna(subset=["Cluster"])
                .head(20)
            )
            st.dataframe(clustered_preview)

            st.write("### Cluster Visualization:")
            fig, ax = plt.subplots()
            # Only the first two selected features are plotted, even when the
            # clustering itself used more dimensions.
            scatter = ax.scatter(
                X_scaled[:, 0],
                X_scaled[:, 1],
                c=labels,
                cmap="viridis",
                alpha=0.7,
            )
            ax.set_xlabel(f"{features[0]} (standardized)")
            ax.set_ylabel(f"{features[1]} (standardized)")
            legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
            ax.add_artist(legend1)
            st.pyplot(fig)
            # Release the figure so repeated runs do not accumulate figures.
            plt.close(fig)