# Spaces: TimoPh / cluster_app (Sleeping)
# File size: 4,522 Bytes
# 84e529c


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import streamlit as st
import matplotlib.pyplot as plt

# Load the full track table. The path is relative to the working directory.
# NOTE(review): Streamlit re-executes this script on every widget interaction,
# so the load and the clustering below run again each time; consider caching
# (e.g. st.cache_data) if the deployed Streamlit version provides it.
data = pd.read_csv("Spotify.csv")

# Numeric columns used as clustering features and as selectable chart metrics.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence'
]

# Work on a reproducible random subset to keep clustering fast. The sample
# size is capped at the number of available rows because DataFrame.sample
# raises ValueError when n exceeds the row count and replace is False.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Keep the identifying columns alongside the metrics (copy, not a view).
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# Discard every row that has a missing value in any retained column.
song_clean = song_analysis.dropna()

# Standardize the metric columns so no single feature dominates the distance
# computation, keeping the original row index on the scaled frame.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(
    data=features_scaled,
    index=song_clean.index,
    columns=song_metrics,
)

# Four-cluster K-Means with a fixed seed so repeated runs on the same data
# produce the same cluster assignment.
kmeans_final = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels_kmeans = kmeans_final.fit_predict(features_scaled_df)

# Attach the labels to a fresh copy of the cleaned (unscaled) data.
sampled_clustered = song_clean.assign(Cluster_KMeans=cluster_labels_kmeans)

# Human-readable archetype label for each K-Means cluster id.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word",
}

# Inverse lookup: archetype label -> cluster id.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))

# ---- Streamlit UI ----
st.title(" Song Cluster Explorer")

# Sidebar control: which archetypes to inspect (first archetype preselected).
selected_clusters = st.sidebar.multiselect(
    label="Choose Cluster(s)",
    options=[*archetype_names.values()],
    default=[archetype_names[0]],
)

# Sidebar control: which metrics appear on the radar chart.
selected_metrics = st.sidebar.multiselect(
    label="Choose Metrics",
    options=song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the chosen archetype labels back into cluster ids.
selected_ids = [cluster_name_to_id[label] for label in selected_clusters]

# Rows that belong to any of the selected clusters.
in_selection = sampled_clustered['Cluster_KMeans'].isin(selected_ids)
cluster_data = sampled_clustered.loc[in_selection]

# Mean of every metric: over the whole clustered sample, and over the selection.
overall_stats = sampled_clustered[song_metrics].mean()
cluster_stats = cluster_data[song_metrics].mean()

# Problem statement shown above the charts.
st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")

# Radar chart: one polygon per selected cluster over the metrics chosen in the
# sidebar. Values are expressed relative to the overall mean (1.0 == average).
if selected_metrics:
    # One spoke per metric; repeat the first angle so each polygon closes.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # Loop-invariant baseline. The deviation is divided by the *magnitude* of
    # the overall mean: dividing by a negative mean would flip the sign and
    # draw an above-average cluster as below average (relevant for metrics
    # with a negative mean, e.g. loudness if it is stored in dB; assumption,
    # confirm against the dataset).
    baseline = overall_stats[selected_metrics]
    baseline_scale = baseline.abs()

    for cid in selected_ids:
        c_stats = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid][song_metrics].mean()
        stats_norm = (c_stats[selected_metrics] - baseline) / baseline_scale + 1
        # Switch to a plain array before closing the polygon: integer-position
        # access with [] on a label-indexed Series is deprecated in pandas.
        stats_norm = stats_norm.to_numpy()
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it so figures
    # do not pile up across Streamlit reruns.
    plt.close(fig)

# Per-cluster summary for each selected cluster: number of songs and share of
# the whole clustered sample.
total_songs = len(sampled_clustered)
for cluster_id in selected_ids:
    members = sampled_clustered.loc[sampled_clustered['Cluster_KMeans'] == cluster_id]
    song_count = len(members)
    share_pct = song_count / total_songs * 100
    st.markdown(f"""

    **{archetype_names[cluster_id]}**  

    - Songs in cluster: {song_count}  

    - % of dataset: {share_pct:.1f}%  

    """)

# Bar chart of cluster sizes over the whole clustered sample (not affected by
# the sidebar selection), ordered by cluster id.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
# Long archetype labels overlap when horizontal, so tilt them.
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Print each count just above its bar.
for bar, size in zip(bars, sizes):
    ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 1, str(size), ha="center")

st.pyplot(fig2)
# pyplot keeps a reference to every figure it creates; close it so figures do
# not pile up across Streamlit reruns.
plt.close(fig2)

# Table with the first 20 songs (name and artists) of the current selection.
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data.loc[:, ['name', 'artists']].head(20)
st.write(example_songs)