# Spaces: Sleeping  (appears to be page-status text captured along with the script; not part of the program)
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.cluster import KMeans | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
# --- Data preparation --------------------------------------------------------
# Load the track table; the path is resolved against the working directory.
data = pd.read_csv("Spotify.csv")

# Numeric audio features used for scaling, clustering and every chart below.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence',
]

# Sample for performance. DataFrame.sample (replace=False) raises ValueError
# when n is larger than the table, so cap the request at the number of rows.
# With 15000 rows or more this selects exactly the same sample as before.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Analysis dataset: identifying columns plus the numeric features.
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# Drop rows with a missing value in any of the kept columns.
song_clean = song_analysis.dropna()

# Standardize the features so each one has zero mean and unit variance before
# clustering; keep the original row index so labels can be joined back.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(
    features_scaled, columns=song_metrics, index=song_clean.index
)

# Fixed random_state so the clustering is reproducible between runs; the
# archetype labels defined further down rely on the cluster ids being stable.
kmeans_final = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels_kmeans = kmeans_final.fit_predict(features_scaled_df)

# Keep the cluster assignment next to the unscaled song data.
sampled_clustered = song_clean.copy()
sampled_clustered['Cluster_KMeans'] = cluster_labels_kmeans
# Human-readable archetype label for each K-Means cluster id (0-3).
archetype_names = dict(enumerate([
    "Pop / Singer-Songwriter / Indie Rock",
    "Instrumental / Acoustic",
    "Balanced Versatile",
    "Rap / Spoken Word",
]))

# Inverse lookup: archetype label -> cluster id.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))
# --- Streamlit app -----------------------------------------------------------
st.title(" Song Cluster Explorer")

# Sidebar: which archetypes to inspect (the label of cluster 0 is pre-selected).
selected_clusters = st.sidebar.multiselect(
    label="Choose Cluster(s)",
    options=[*archetype_names.values()],
    default=[archetype_names[0]],
)

# Sidebar: which audio features appear on the radar chart.
selected_metrics = st.sidebar.multiselect(
    label="Choose Metrics",
    options=song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the chosen labels back into K-Means cluster ids.
selected_ids = [cluster_name_to_id[label] for label in selected_clusters]

# Rows belonging to any selected cluster, plus feature means for that
# selection and for the whole clustered sample.
in_selection = sampled_clustered['Cluster_KMeans'].isin(selected_ids)
cluster_data = sampled_clustered.loc[in_selection]
cluster_stats = cluster_data.loc[:, song_metrics].mean()
overall_stats = sampled_clustered.loc[:, song_metrics].mean()

# Problem statement shown under the title.
st.subheader(
    "Business Problem Statement: How can Spotify optimize personalized "
    "playlists to increase listener engagement, satisfaction, and "
    "long-term retention?"
)
# Radar chart: one polygon per selected cluster over the metrics chosen in the
# sidebar. Each plotted value is the cluster mean relative to the mean of the
# whole clustered sample, so 1.0 on the radial axis means "equal to overall".
if selected_metrics:
    # One spoke per metric; the first angle is repeated to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    overall_selected = overall_stats[selected_metrics]
    for cid in selected_ids:
        in_cluster = sampled_clustered['Cluster_KMeans'] == cid
        c_stats = sampled_clustered.loc[in_cluster, selected_metrics].mean()

        # Divide by the magnitude of the overall mean: for a metric whose
        # overall mean is negative (loudness), dividing by the signed mean
        # would flip the direction of the deviation. For positive means the
        # result is unchanged.
        stats_norm = (
            (c_stats - overall_selected) / overall_selected.abs() + 1
        ).to_numpy()

        # Close the polygon. Work on the NumPy array so the first element is
        # taken by position; integer-position [] on a label-indexed Series is
        # deprecated in pandas.
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # Only request a legend when at least one cluster was plotted; otherwise
    # matplotlib warns that there are no labelled artists.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; release this one now
    # that it has been rendered so figures do not pile up across script runs.
    plt.close(fig)
# Text summary for each selected cluster: its size and its share of the
# clustered sample.
total_songs = len(sampled_clustered)
for cid in selected_ids:
    member_count = int((sampled_clustered['Cluster_KMeans'] == cid).sum())
    share = member_count / total_songs * 100
    st.markdown(
        f"\n**{archetype_names[cid]}**\n"
        f"- Songs in cluster: {member_count}\n"
        f"- % of dataset: {share:.1f}%\n"
    )
# Bar chart of how many songs fall into each cluster (all clusters, regardless
# of the sidebar selection), ordered by cluster id.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
# The archetype labels are long; tilt them so they do not overlap.
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Print the count just above each bar, centred horizontally.
for bar, size in zip(bars, sizes):
    ax2.text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 1,
        str(size),
        ha="center",
    )

st.pyplot(fig2)
# pyplot keeps every figure alive until it is closed; release this one now
# that it has been rendered so figures do not pile up across script runs.
plt.close(fig2)
# Preview table: the first 20 songs from the currently selected cluster(s).
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data.loc[:, ['name', 'artists']].head(20)
st.write(example_songs)