File size: 4,522 Bytes
84e529c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import streamlit as st
import matplotlib.pyplot as plt

# Load the raw track table. The path is relative, so the app must be started
# from the directory that contains Spotify.csv.
data = pd.read_csv("Spotify.csv")

# Numeric columns used as clustering features and as selectable chart metrics.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence'
]

# Cluster on a fixed-seed random sample to keep the fit fast.
# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling without replacement), so the request is capped at the table size.
sampled_data = data.sample(n=min(15000, len(data)), random_state=42)

# Keep the identifying columns next to the features so songs can be listed later.
song_analysis = sampled_data[['name', 'artists'] + song_metrics].copy()

# K-Means cannot handle NaN, so rows with any missing value are discarded.
song_clean = song_analysis.dropna()

# Standardise every feature so that large-valued columns (e.g. duration_ms)
# do not dominate the distance computation.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(song_clean[song_metrics])
features_scaled_df = pd.DataFrame(features_scaled, columns=song_metrics, index=song_clean.index)

# Fit K-Means with a fixed random_state so repeated runs on the same data
# produce the same cluster assignment.
# NOTE(review): the cluster-ID -> archetype labels used further down presumably
# rely on this exact seed/sample combination - confirm before changing either.
kmeans_final = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels_kmeans = kmeans_final.fit_predict(features_scaled_df)

# Attach the cluster ID to a copy of the cleaned (unscaled) rows.
sampled_clustered = song_clean.copy()
sampled_clustered['Cluster_KMeans'] = cluster_labels_kmeans

# Display label for each K-Means cluster ID (IDs are assigned in list order,
# starting at 0).
archetype_names = dict(enumerate([
    "Pop / Singer-Songwriter / Indie Rock",
    "Instrumental / Acoustic",
    "Balanced Versatile",
    "Rap / Spoken Word",
]))

# Inverse lookup: display label -> cluster ID.
cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))

# ---- Streamlit page ----
st.title(" Song Cluster Explorer")

# Sidebar control 1: which cluster archetypes to inspect (first one preselected).
selected_clusters = st.sidebar.multiselect(
    "Choose Cluster(s)",
    options=[*archetype_names.values()],
    default=[archetype_names[0]],
)

# Sidebar control 2: which audio metrics to draw on the radar chart.
selected_metrics = st.sidebar.multiselect(
    label="Choose Metrics",
    options=song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the chosen display labels back into numeric cluster IDs.
selected_ids = [cluster_name_to_id[label] for label in selected_clusters]

# Rows belonging to any selected cluster, plus mean feature values for that
# selection and for the whole clustered sample.
in_selected_cluster = sampled_clustered['Cluster_KMeans'].isin(selected_ids)
cluster_data = sampled_clustered.loc[in_selected_cluster]
overall_stats = sampled_clustered[song_metrics].mean()
cluster_stats = cluster_data[song_metrics].mean()

# Headline describing the question the dashboard is meant to inform.
st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")

# Radar chart: one polygon per selected cluster over the metrics chosen in the
# sidebar. Skipped entirely when no metric is selected (there would be no axes).
if selected_metrics:
    # One spoke per metric; the first angle is repeated so each polygon closes.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, angles[:1]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # The dataset-wide mean of each metric is the baseline and plots at 1.0.
    # The deviation is divided by the *magnitude* of the baseline: dividing by
    # a negative mean would flip the sign and draw an above-average cluster
    # below the baseline.
    baseline_stats = overall_stats[selected_metrics]
    baseline_scale = baseline_stats.abs()

    for cid in selected_ids:
        c_stats = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid][song_metrics].mean()
        # Convert to a plain array before closing the polygon: integer keys on
        # a Series indexed by metric names are a deprecated positional fallback.
        stats_norm = ((c_stats[selected_metrics] - baseline_stats) / baseline_scale + 1).to_numpy()
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # A legend without any plotted cluster only produces a matplotlib warning.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # Streamlit re-runs the script on every interaction; close the figure so
    # pyplot does not keep one more open figure alive per rerun.
    plt.close(fig)

# Text summary for every selected cluster: its label, how many songs it
# contains, and what share of the clustered sample that represents.
for cid in selected_ids:
    members = sampled_clustered['Cluster_KMeans'] == cid
    c_data = sampled_clustered[members]
    n_songs = len(c_data)
    share_pct = n_songs / len(sampled_clustered) * 100
    label = archetype_names[cid]
    st.markdown(f"""

    **{label}**  

    - Songs in cluster: {n_songs}  

    - % of dataset: {share_pct:.1f}%  

    """)

# Bar chart with the number of songs in every cluster (all clusters, not just
# the selected ones), ordered by cluster ID.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
# The archetype labels are long; tilt them so they do not overlap.
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Print the exact count just above each bar.
for bar, size in zip(bars, sizes):
    ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 1, str(size), ha="center")

st.pyplot(fig2)
# Streamlit re-runs the script on every interaction; close the figure so
# pyplot does not keep one more open figure alive per rerun.
plt.close(fig2)

# Table with up to 20 songs (title and artists) from the selected cluster(s);
# it follows the sidebar selection because cluster_data is already filtered.
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data.loc[:, ['name', 'artists']].head(20)
st.write(example_songs)