Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- cluster_app.py +126 -0
- requirements.txt +7 -0
cluster_app.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import StandardScaler
|
| 5 |
+
from sklearn.cluster import KMeans
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
# ── Data loading and clustering ──────────────────────────────────────
# Load the Spotify track dataset (expects Spotify.csv next to this app).
data = pd.read_csv("Spotify.csv")

# Audio/popularity metrics used as clustering features.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence'
]

# Sample for performance. Never request more rows than the file has:
# pandas raises ValueError when n > len(data) and replace=False.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Keep identifying columns alongside the metric columns for display later.
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# Rows with any missing metric would break scaling/clustering — drop them.
song_clean = song_analysis.dropna()

# Standardize each metric to zero mean / unit variance so no single
# feature (e.g. duration_ms, with a huge raw scale) dominates the
# Euclidean distances KMeans uses.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(features_scaled, columns=song_metrics, index=song_clean.index)

# Fixed random_state keeps cluster IDs stable across reruns, so the
# archetype labels defined below stay attached to the same clusters.
kmeans_final = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels_kmeans = kmeans_final.fit_predict(features_scaled_df)

# Attach cluster labels to the (unscaled) cleaned data for display.
sampled_clustered = song_clean.copy()
sampled_clustered['Cluster_KMeans'] = cluster_labels_kmeans
|
| 40 |
+
|
| 41 |
+
# Human-readable archetype label for each K-Means cluster ID.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word",
}

# Inverse mapping: archetype label -> numeric cluster ID.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))

# ── Streamlit UI ─────────────────────────────────────────────────────
st.title(" Song Cluster Explorer")

# Sidebar controls: which clusters and which metrics to display.
selected_clusters = st.sidebar.multiselect(
    "Choose Cluster(s)",
    options=list(archetype_names.values()),
    default=[archetype_names[0]],
)

selected_metrics = st.sidebar.multiselect(
    "Choose Metrics",
    song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the selected archetype labels back to cluster IDs.
selected_ids = [cluster_name_to_id[name] for name in selected_clusters]

# Rows belonging to the chosen clusters, plus per-selection and
# overall metric means (the radar chart normalizes against the latter).
cluster_data = sampled_clustered[sampled_clustered['Cluster_KMeans'].isin(selected_ids)]
cluster_stats = cluster_data[song_metrics].mean()
overall_stats = sampled_clustered[song_metrics].mean()

st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")
|
| 78 |
+
|
| 79 |
+
# Radar chart: each selected cluster's mean metric profile relative to
# the overall dataset mean (1.0 == dataset average for that metric).
if selected_metrics:
    # One spoke per metric; repeat the first angle to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    for cid in selected_ids:
        c_stats = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid][song_metrics].mean()
        # Relative deviation from the overall mean, centred at 1.0.
        # NOTE(review): metrics whose overall mean is ~0 or negative
        # (e.g. loudness) can flip/distort this ratio — confirm acceptable.
        stats_norm = (c_stats[selected_metrics] - overall_stats[selected_metrics]) / overall_stats[selected_metrics] + 1
        # Convert to a plain array before closing the polygon: integer
        # [0] on a string-indexed Series is deprecated positional
        # indexing in pandas 2.x and removed in pandas 3.
        stats_vals = stats_norm.to_numpy()
        stats_vals = np.concatenate((stats_vals, [stats_vals[0]]))

        ax.plot(angles, stats_vals, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_vals, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # Only draw a legend when something was plotted; an empty selection
    # otherwise triggers a "no artists with labels" warning.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
|
| 99 |
+
|
| 100 |
+
# Summary box per selected cluster: song count and share of the dataset.
for cid in selected_ids:
    c_data = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid]
    st.markdown(f"""
**{archetype_names[cid]}**
- Songs in cluster: {len(c_data)}
- % of dataset: {len(c_data)/len(sampled_clustered)*100:.1f}%
""")

# Bar chart of all cluster sizes (not just the selected ones).
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Write each cluster's exact count just above its bar.
for rect, count in zip(bars, sizes):
    x_centre = rect.get_x() + rect.get_width() / 2.
    ax2.text(x_centre, rect.get_height() + 1, str(count), ha="center")

st.pyplot(fig2)
|
| 123 |
+
|
| 124 |
+
# Example tracks from the currently selected cluster(s).
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data[['name', 'artists']]
st.write(example_songs.head(20))
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit==1.36.0
pandas==2.2.2
altair==5.3.0
pyarrow==16.1.0
duckdb==1.0.0
polars==1.5.0
numpy==1.26.4
scikit-learn==1.5.1
matplotlib==3.9.0