TimoPh committed on
Commit
84e529c
·
verified ·
1 Parent(s): 00a5210

Upload 2 files

Browse files
Files changed (2) hide show
  1. cluster_app.py +126 -0
  2. requirements.txt +7 -0
cluster_app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import StandardScaler
5
+ from sklearn.cluster import KMeans
6
+ import streamlit as st
7
+ import matplotlib.pyplot as plt
8
+
9
# Audio features used both for clustering and as the selectable metrics.
song_metrics = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness',
    'popularity', 'speechiness', 'tempo', 'valence',
]


@st.cache_data
def load_clustered_songs(csv_path="Spotify.csv", sample_size=15000,
                         n_clusters=4, random_state=42):
    """Load the song table, sample it, and attach a KMeans cluster label.

    Streamlit re-executes the whole script on every widget interaction, so
    the CSV read and the KMeans fit are wrapped in ``st.cache_data`` to run
    once per distinct argument set instead of once per rerun.

    Args:
        csv_path: Path of the CSV file to read.
        sample_size: Maximum number of rows to sample before cleaning. It is
            clamped to the number of rows available so small files do not
            make ``DataFrame.sample`` raise.
        n_clusters: Number of KMeans clusters.
        random_state: Seed used for both the row sample and KMeans, keeping
            cluster IDs stable between runs.

    Returns:
        DataFrame with ``name``, ``artists``, every column in
        ``song_metrics`` and an integer ``Cluster_KMeans`` column.
    """
    data = pd.read_csv(csv_path)

    # Sample first, then drop incomplete rows: this order is kept so the
    # selected rows (and therefore the fitted clusters) stay the same.
    sampled_data = data.sample(
        n=min(sample_size, len(data)), random_state=random_state
    )
    song_clean = sampled_data[['name', 'artists'] + song_metrics].dropna()

    # Standardize so no single metric dominates the distance computation.
    features_scaled = StandardScaler().fit_transform(song_clean[song_metrics])
    features_scaled_df = pd.DataFrame(
        features_scaled, columns=song_metrics, index=song_clean.index
    )

    kmeans_final = KMeans(
        n_clusters=n_clusters, random_state=random_state, n_init=10
    )

    clustered = song_clean.copy()
    clustered['Cluster_KMeans'] = kmeans_final.fit_predict(features_scaled_df)
    return clustered


# Cleaned sample with a 'Cluster_KMeans' label per song.
sampled_clustered = load_clustered_songs()

# Display names for the cluster IDs. These labels are tied to the fixed
# random_state above; refitting with other settings may reorder the IDs.
archetype_names = {
    0: "Pop / Singer-Songwriter / Indie Rock",
    1: "Instrumental / Acoustic",
    2: "Balanced Versatile",
    3: "Rap / Spoken Word",
}

# Reverse map: display name -> cluster ID.
cluster_name_to_id = {v: k for k, v in archetype_names.items()}
51
+
52
# ---- Streamlit page ----
st.title(" Song Cluster Explorer")

# Sidebar controls: which clusters to inspect and which metrics to chart.
selected_clusters = st.sidebar.multiselect(
    "Choose Cluster(s)",
    options=list(archetype_names.values()),
    default=[archetype_names[0]],
)

selected_metrics = st.sidebar.multiselect(
    "Choose Metrics",
    song_metrics,
    default=['danceability', 'energy', 'valence'],
)

# Translate the chosen display names back into KMeans cluster IDs.
selected_ids = [cluster_name_to_id[name] for name in selected_clusters]

# With nothing selected the cluster-specific sections are empty; say so
# instead of leaving the user with blank output.
if not selected_ids:
    st.sidebar.info("Select at least one cluster to see its profile and example songs.")

# Rows that belong to any selected cluster, and the dataset-wide mean of
# every metric (used as the comparison baseline).
cluster_data = sampled_clustered[sampled_clustered['Cluster_KMeans'].isin(selected_ids)]
overall_stats = sampled_clustered[song_metrics].mean()

# Problem statement shown above the charts.
st.subheader("Business Problem Statement: How can Spotify optimize personalized playlists to increase listener engagement, satisfaction, and long-term retention?")
78
+
79
# Radar chart: mean of each selected cluster on the chosen metrics, expressed
# relative to the dataset-wide mean (1.0 means "equal to the overall mean").
if selected_metrics:
    # One spoke per metric; the first angle is repeated so outlines close.
    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
    angles = np.concatenate((angles, angles[:1]))

    baseline = overall_stats[selected_metrics]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    for cid in selected_ids:
        in_cluster = sampled_clustered['Cluster_KMeans'] == cid
        c_stats = sampled_clustered.loc[in_cluster, selected_metrics].mean()

        # Divide by |baseline| so a metric whose overall mean is negative
        # (possibly 'loudness' if it is stored in dB -- TODO confirm) still
        # plots above 1.0 when the cluster mean is above the overall mean.
        stats_norm = ((c_stats - baseline) / baseline.abs() + 1).to_numpy()
        # Slice the array instead of using `series[0]`: integer keys on a
        # string-labelled Series are deprecated positional access in pandas 2.x.
        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
        ax.fill(angles, stats_norm, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(selected_metrics)
    ax.set_title("Relative Feature Profile", pad=30)
    # A legend without any plotted line only triggers a matplotlib warning.
    if selected_ids:
        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps it alive across reruns.
    plt.close(fig)
99
+
100
# Per-cluster summary: song count and share of the clustered sample for
# every cluster currently selected in the sidebar.
total_songs = len(sampled_clustered)
for cid in selected_ids:
    c_data = sampled_clustered[sampled_clustered['Cluster_KMeans'] == cid]
    n_songs = len(c_data)
    # st.markdown dedents the text, so the indentation here is cosmetic.
    st.markdown(f"""
    **{archetype_names[cid]}**
    - Songs in cluster: {n_songs}
    - % of dataset: {n_songs/total_songs*100:.1f}%
    """)
108
+
109
# Bar chart of how many songs fall into each cluster (all clusters, not only
# the selected ones), ordered by cluster ID.
sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
names = [archetype_names[i] for i in sizes.index]

fig2, ax2 = plt.subplots(figsize=(8, 4))
bars = ax2.bar(names, sizes, color="skyblue")
ax2.set_ylabel("Number of Songs")
ax2.set_title("Cluster Distribution")
plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

# Write the exact count just above each bar.
for bar, size in zip(bars, sizes):
    ax2.text(
        bar.get_x() + bar.get_width() / 2.,
        bar.get_height() + 1,
        str(size),
        ha="center",
    )

st.pyplot(fig2)
# Release the figure; pyplot otherwise keeps it alive across reruns.
plt.close(fig2)
123
+
124
# Table with the first 20 songs of the currently selected cluster(s).
st.subheader("Example Songs from Selected Cluster(s)")
example_songs = cluster_data[['name', 'artists']].head(20)
st.write(example_songs)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
streamlit==1.36.0
pandas==2.2.2
altair==5.3.0
pyarrow==16.1.0
duckdb==1.0.0
polars==1.5.0
numpy==1.26.4
# Imported by cluster_app.py (sklearn, matplotlib.pyplot) but previously
# missing; pinned versions are a reviewer's choice -- confirm before deploy.
scikit-learn==1.5.0
matplotlib==3.9.0