Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- cluster_app.py +126 -0
- requirements.txt +7 -0
cluster_app.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import StandardScaler
|
| 5 |
+
from sklearn.cluster import KMeans
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
|
| 9 |
+
# ── Data loading and clustering ──────────────────────────────────────
# Load the Spotify track dataset (expects Spotify.csv next to this app).
data = pd.read_csv("Spotify.csv")

# Audio/popularity metrics used as clustering features.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence'
]

# Sample for performance. Never request more rows than the file has:
# pandas raises ValueError when n > len(data) and replace=False.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Keep identifying columns alongside the metric columns for display later.
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# Rows with any missing metric would break scaling/clustering — drop them.
song_clean = song_analysis.dropna()

# Standardize each metric to zero mean / unit variance so no single
# feature (e.g. duration_ms, with a huge raw scale) dominates the
# Euclidean distances KMeans uses.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(features_scaled, columns=song_metrics, index=song_clean.index)

# Fixed random_state keeps cluster IDs stable across reruns, so the
# archetype labels defined below stay attached to the same clusters.
kmeans_final = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels_kmeans = kmeans_final.fit_predict(features_scaled_df)

# Attach cluster labels to the (unscaled) cleaned data for display.
sampled_clustered = song_clean.copy()
sampled_clustered['Cluster_KMeans'] = cluster_labels_kmeans
|
| 40 |
+
|
| 41 |
+
# Human-readable archetype label for each K-Means cluster ID.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word",
}

# Inverse mapping: archetype label -> numeric cluster ID.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))

# ── Streamlit UI ─────────────────────────────────────────────────────
st.title(" Song Cluster Explorer")

# Sidebar controls: which clusters and which metrics to display.
selected_clusters = st.sidebar.multiselect(
    "Choose Cluster(s)",
    options=list(archetype_names.values()),
    default=[archetype_names[0]],
)

selected_metrics = st.sidebar.multiselect(
    "Choose Metrics",
    song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the selected archetype labels back to cluster IDs.
selected_ids = [cluster_name_to_id[name] for name in selected_clusters]

# Rows belonging to the chosen clusters, plus per-selection and
# overall metric means (the radar chart normalizes against the latter).
cluster_data = sampled_clustered[sampled_clustered['Cluster_KMeans'].isin(selected_ids)]
cluster_stats = cluster_data[song_metrics].mean()
overall_stats = sampled_clustered[song_metrics].mean()

st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")
|
| 78 |
+
|
| 79 |
+
# Radar chart: each selected cluster's mean metric profile relative to
# the overall dataset mean (1.0 == dataset average for that metric).
if selected_metrics:
    # One spoke per metric; repeat the first angle to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    for cid in selected_ids:
        c_stats = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid][song_metrics].mean()
        # Relative deviation from the overall mean, centred at 1.0.
        # NOTE(review): metrics whose overall mean is ~0 or negative
        # (e.g. loudness) can flip/distort this ratio — confirm acceptable.
        stats_norm = (c_stats[selected_metrics] - overall_stats[selected_metrics]) / overall_stats[selected_metrics] + 1
        # Convert to a plain array before closing the polygon: integer
        # [0] on a string-indexed Series is deprecated positional
        # indexing in pandas 2.x and removed in pandas 3.
        stats_vals = stats_norm.to_numpy()
        stats_vals = np.concatenate((stats_vals, [stats_vals[0]]))

        ax.plot(angles, stats_vals, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_vals, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # Only draw a legend when something was plotted; an empty selection
    # otherwise triggers a "no artists with labels" warning.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
|
| 99 |
+
|
| 100 |
+
# Summary box per selected cluster: song count and share of the dataset.
for cid in selected_ids:
    c_data = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid]
    st.markdown(f"""
**{archetype_names[cid]}**
- Songs in cluster: {len(c_data)}
- % of dataset: {len(c_data)/len(sampled_clustered)*100:.1f}%
""")

# Bar chart of all cluster sizes (not just the selected ones).
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Write each cluster's exact count just above its bar.
for rect, count in zip(bars, sizes):
    x_centre = rect.get_x() + rect.get_width() / 2.
    ax2.text(x_centre, rect.get_height() + 1, str(count), ha="center")

st.pyplot(fig2)
|
| 123 |
+
|
| 124 |
+
# Example tracks from the currently selected cluster(s).
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data[['name', 'artists']]
st.write(example_songs.head(20))
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit==1.36.0
pandas==2.2.2
altair==5.3.0
pyarrow==16.1.0
duckdb==1.0.0
polars==1.5.0
numpy==1.26.4
scikit-learn==1.5.1
matplotlib==3.9.0