"""Streamlit app for exploring K-Means clusters of Spotify song features.

The script reads ``Spotify.csv``, samples rows, standardizes the audio
metrics, fits a 4-cluster K-Means model, and renders:

* a radar chart of each selected cluster's mean metrics relative to the
  overall mean,
* the size of each selected cluster,
* a bar chart of all cluster sizes,
* example songs from the selected cluster(s).
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

DATA_PATH = "Spotify.csv"
SAMPLE_SIZE = 15000
RANDOM_STATE = 42
N_CLUSTERS = 4

# Numeric columns used both for clustering and for the charts.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'popularity',
    'speechiness', 'tempo', 'valence'
]

# Display names for the cluster IDs produced by K-Means.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word"
}

# Reverse map for easy lookup (display name -> cluster ID).
cluster_name_to_id = {v: k for k, v in archetype_names.items()}


@st.cache_data
def load_clustered_songs(csv_path: str = DATA_PATH,
                         sample_size: int = SAMPLE_SIZE) -> pd.DataFrame:
    """Load, sample, clean and cluster the songs.

    Streamlit re-executes the whole script on every widget interaction, so
    the CSV read and the K-Means fit are cached here instead of being
    repeated on each rerun.

    Args:
        csv_path: Path of the CSV file to read.
        sample_size: Maximum number of rows to sample before cleaning.

    Returns:
        A DataFrame with ``name``, ``artists``, the columns listed in
        ``song_metrics`` and a ``Cluster_KMeans`` label column. Rows with
        missing values in any of those columns are dropped.
    """
    data = pd.read_csv(csv_path)

    # DataFrame.sample raises if n exceeds the row count, so cap it.
    sampled_data = data.sample(n=min(sample_size, len(data)),
                               random_state=RANDOM_STATE)

    song_clean = sampled_data[['name', 'artists'] + song_metrics].dropna()

    # Standardize so that no single metric dominates the distance measure.
    features_scaled = StandardScaler().fit_transform(song_clean[song_metrics])
    features_scaled_df = pd.DataFrame(features_scaled, columns=song_metrics,
                                      index=song_clean.index)

    # Fixed random state keeps the cluster assignment reproducible.
    kmeans_final = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE,
                          n_init=10)

    clustered = song_clean.copy()
    clustered['Cluster_KMeans'] = kmeans_final.fit_predict(features_scaled_df)
    return clustered


sampled_clustered = load_clustered_songs()

# Now time for the app
st.title(" Song Cluster Explorer")

# Sidebar controls for the clusters and metrics to display.
selected_clusters = st.sidebar.multiselect(
    "Choose Cluster(s)",
    options=list(archetype_names.values()),
    default=[archetype_names[0]]
)
selected_metrics = st.sidebar.multiselect(
    "Choose Metrics",
    song_metrics,
    default=['danceability', 'energy', 'valence']
)

# Map selected names -> IDs
selected_ids = [cluster_name_to_id[name] for name in selected_clusters]

# Rows belonging to any of the selected clusters.
cluster_data = sampled_clustered[
    sampled_clustered['Cluster_KMeans'].isin(selected_ids)
]

overall_stats = sampled_clustered[song_metrics].mean()

# Per-cluster means computed once; reindex so a cluster ID without rows
# yields NaN instead of a KeyError.
cluster_means = (
    sampled_clustered.groupby('Cluster_KMeans')[song_metrics]
    .mean()
    .reindex(list(archetype_names))
)

# Cluster sizes, shared by the summary text and the bar chart.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
total_songs = len(sampled_clustered)

# Add problem statement
st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")

# Radar chart over the user-selected metrics.
if selected_metrics and selected_ids:
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    # Repeat the first angle so each polygon closes.
    angles = np.concatenate((angles, angles[:1]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    for cid in selected_ids:
        c_stats = cluster_means.loc[cid, selected_metrics]
        baseline = overall_stats[selected_metrics]
        # Relative profile: 1.0 means "equal to the overall mean".
        stats_norm = ((c_stats - baseline) / baseline + 1).to_numpy()
        # Close the polygon positionally (the Series is label-indexed, so
        # integer [] access on it would be a deprecated positional lookup).
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

        ax.plot(angles, stats_norm, 'o-', linewidth=2,
                label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # Release the figure; otherwise figures accumulate across reruns.
    plt.close(fig)
else:
    st.info("Select at least one cluster and one metric to draw the radar chart.")

# Size summary for each selected cluster.
for cid in selected_ids:
    n_songs = int(sizes.get(cid, 0))
    st.markdown(
        f"\n**{archetype_names[cid]}**\n"
        f"- Songs in cluster: {n_songs}\n"
        f"- % of dataset: {n_songs / total_songs * 100:.1f}%\n"
    )

# Bar chart with the size of every cluster.
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Print the count just above each bar.
for bar, size in zip(bars, sizes):
    ax2.text(bar.get_x() + bar.get_width() / 2.,
             bar.get_height() + 1,
             str(size), ha="center")

st.pyplot(fig2)
plt.close(fig2)

# Example songs from the current selection.
st.subheader("Example Songs from Selected Cluster(s)")
st.write(cluster_data[['name', 'artists']].head(20))