# cluster_app / cluster_app.py
# Uploaded by TimoPh ("Upload 2 files", revision 84e529c, verified)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import streamlit as st
import matplotlib.pyplot as plt
# Load the full Spotify track table. The path is relative, so it is resolved
# against the directory the app is started from.
data = pd.read_csv("Spotify.csv")

# Audio features used for clustering and for profiling the clusters.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence',
]

# Work on a fixed-seed random sample to keep clustering fast. The sample size
# is capped at the number of available rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without
# replacement). For files with at least 15000 rows the sampled rows are
# exactly the same as before, so downstream cluster ids do not change.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Keep only the identifying columns plus the metrics.
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# Drop rows that have a missing value in any of the kept columns.
song_clean = song_analysis.dropna()
# Standardize each metric so that wide-ranged features do not dominate the
# distance computation used by k-means.
scaler = StandardScaler()
scaler.fit(song_clean[song_metrics])
features_scaled = scaler.transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(
    features_scaled,
    index=song_clean.index,
    columns=song_metrics,
)

# Fit k-means with a fixed seed so the cluster ids are reproducible between
# runs (the archetype labels are tied to these ids).
kmeans_final = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_final.fit(features_scaled_df)
cluster_labels_kmeans = kmeans_final.labels_

# Attach the cluster id to a copy of the cleaned sample.
sampled_clustered = song_clean.assign(Cluster_KMeans=cluster_labels_kmeans)
# Human-readable archetype label for each k-means cluster id.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word",
}

# Inverse lookup (archetype label -> cluster id), used to translate the
# labels picked in the UI back into ids.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))
# ---- Streamlit page ----
st.title(" Song Cluster Explorer")

# Sidebar picker for the archetypes to inspect; the archetype of cluster 0 is
# pre-selected.
cluster_options = list(archetype_names.values())
selected_clusters = st.sidebar.multiselect(
    label="Choose Cluster(s)",
    options=cluster_options,
    default=[archetype_names[0]],
)

# Sidebar picker for the metrics to compare.
selected_metrics = st.sidebar.multiselect(
    label="Choose Metrics",
    options=song_metrics,
    default=['danceability', 'energy', 'valence'],
)
# Translate the archetype labels chosen in the sidebar into cluster ids.
selected_ids = list(map(cluster_name_to_id.__getitem__, selected_clusters))

# Rows belonging to any of the selected clusters.
in_selected_clusters = sampled_clustered['Cluster_KMeans'].isin(selected_ids)
cluster_data = sampled_clustered.loc[in_selected_clusters]

# Mean of every metric: over the selected clusters, and over the whole sample.
cluster_stats = cluster_data[song_metrics].mean()
overall_stats = sampled_clustered[song_metrics].mean()
# State the business question the dashboard is meant to support.
st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")

# Radar chart of the selected metrics, one polygon per selected cluster.
# Values are relative to the overall sample mean: 1.0 means "same as average".
if selected_metrics:
    # One spoke per metric; the first angle is repeated to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    # Loop-invariant baseline. The deviation is divided by the magnitude of
    # the overall mean so that metrics with a negative mean (e.g. loudness in
    # dB) are not drawn with an inverted direction. A zero mean would give a
    # division by zero, so it is mapped to NaN and that point is not drawn.
    baseline = overall_stats[selected_metrics]
    scale = baseline.abs().replace(0, np.nan)

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    for cid in selected_ids:
        in_cluster = sampled_clustered['Cluster_KMeans'] == cid
        c_stats = sampled_clustered.loc[in_cluster, selected_metrics].mean()
        # Convert to a plain array before closing the polygon: integer
        # indexing on a label-indexed Series is deprecated in pandas.
        stats_norm = ((c_stats - baseline) / scale + 1).to_numpy()
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))
        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # Only draw a legend when at least one cluster was plotted; otherwise
    # matplotlib warns that there are no labelled artists.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep one more open figure per rerun.
    plt.close(fig)
# Short summary for each selected cluster: absolute size and share of the
# clustered sample.
total_songs = len(sampled_clustered)
for cid in selected_ids:
    n_songs = int((sampled_clustered['Cluster_KMeans'] == cid).sum())
    share_pct = n_songs / total_songs * 100
    summary_lines = [
        "",
        f"**{archetype_names[cid]}**",
        f"- Songs in cluster: {n_songs}",
        f"- % of dataset: {share_pct:.1f}%",
        "",
    ]
    st.markdown("\n".join(summary_lines))
# Bar chart with the number of songs in every cluster (not only the selected
# ones), ordered by cluster id.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
# The archetype labels are long; rotate them so they do not overlap.
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Write the exact count just above each bar.
for bar, size in zip(bars, sizes):
    ax2.text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 1,
        str(size),
        ha="center",
    )

st.pyplot(fig2)
# Streamlit reruns the script on every interaction; close the figure so
# pyplot does not keep one more open figure per rerun.
plt.close(fig2)
# Table with the first 20 songs (name and artists) of the selected clusters;
# it follows the sidebar selection because cluster_data does.
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data.loc[:, ['name', 'artists']].head(20)
st.write(example_songs)