Spaces:

TimoPh
/

cluster_app

Sleeping

App Files Files Community

cluster_app / cluster_app.py

TimoPh

Upload 2 files

84e529c verified 5 months ago

raw

history blame contribute delete

4.52 kB


	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler
	from sklearn.cluster import KMeans
	import streamlit as st
	import matplotlib.pyplot as plt

	# Audio features used both as clustering inputs and as the metrics offered
	# in the sidebar.
	song_metrics = [
	    'acousticness', 'danceability', 'duration_ms', 'energy',
	    'instrumentalness', 'liveness', 'loudness',
	    'popularity', 'speechiness', 'tempo', 'valence',
	]


	@st.cache_data
	def load_clustered_songs(csv_path="Spotify.csv", sample_size=15000,
	                         n_clusters=4, random_state=42):
	    """Load the song table, sample it, and attach a KMeans cluster label.

	    Streamlit re-executes the whole script on every widget interaction;
	    caching keeps the CSV read and the KMeans fit from being repeated on
	    each rerun.

	    Args:
	        csv_path: Path of the CSV file to read.
	        sample_size: Maximum number of rows to sample for performance.
	        n_clusters: Number of KMeans clusters to fit.
	        random_state: Seed shared by the sampling and by KMeans so the
	            cluster ids stay reproducible between runs (the id -> archetype
	            name mapping further down relies on that).

	    Returns:
	        DataFrame with 'name', 'artists', every column in ``song_metrics``
	        and an integer 'Cluster_KMeans' column; rows with missing values
	        in any of those columns are dropped before clustering.
	    """
	    data = pd.read_csv(csv_path)

	    # Cap the sample at the table size: DataFrame.sample raises ValueError
	    # when n exceeds the number of rows (and replace=False).
	    sampled_data = data.sample(n=min(sample_size, len(data)),
	                               random_state=random_state)

	    # Keep only the identifying columns and the metrics, without gaps.
	    song_clean = sampled_data[['name', 'artists'] + song_metrics].dropna()

	    # Standardize so no single metric dominates the distance computation
	    # merely because of its scale.
	    features_scaled = StandardScaler().fit_transform(song_clean[song_metrics])

	    kmeans_final = KMeans(n_clusters=n_clusters, random_state=random_state,
	                          n_init=10)

	    clustered = song_clean.copy()
	    clustered['Cluster_KMeans'] = kmeans_final.fit_predict(features_scaled)
	    return clustered


	# Clustered sample used by every chart and table below.
	sampled_clustered = load_clustered_songs()

	# Human-readable archetype label for each KMeans cluster id (0..3).
	archetype_names = dict(enumerate((
	    "Pop / Singer-Songwriter / Indie Rock",
	    "Instrumental / Acoustic",
	    "Balanced Versatile",
	    "Rap / Spoken Word",
	)))

	# Inverse lookup: archetype label -> cluster id.
	cluster_name_to_id = dict(zip(archetype_names.values(), archetype_names.keys()))

	# ---- App header ----
	st.title(" Song Cluster Explorer")

	# ---- Sidebar controls: which archetypes and which metrics to display ----
	sidebar = st.sidebar

	# Archetypes are chosen by label; the first archetype is preselected.
	selected_clusters = sidebar.multiselect(
	    label="Choose Cluster(s)",
	    options=list(archetype_names.values()),
	    default=[archetype_names[0]],
	)

	# Metrics drawn on the radar chart.
	selected_metrics = sidebar.multiselect(
	    label="Choose Metrics",
	    options=song_metrics,
	    default=['danceability', 'energy', 'valence'],
	)

	# Translate the chosen archetype labels back into KMeans cluster ids.
	selected_ids = [cluster_name_to_id[label] for label in selected_clusters]

	# Rows that belong to any selected cluster.
	in_selection = sampled_clustered['Cluster_KMeans'].isin(selected_ids)
	cluster_data = sampled_clustered.loc[in_selection]

	# Per-metric means: over the whole sample (the radar chart's baseline)
	# and over the current selection.
	overall_stats = sampled_clustered[song_metrics].mean()
	cluster_stats = cluster_data[song_metrics].mean()

	# Business question the app is meant to inform, shown as a subheader.
	problem_statement = (
	    "Business Problem Statement: How can Spotify optimize personalized "
	    "playlists to increase listener engagement, satisfaction, and "
	    "long-term retention?"
	)
	st.subheader(problem_statement)

	# Radar chart: one closed polygon per selected cluster over the chosen
	# metrics. Each value is the cluster mean relative to the overall mean,
	# so 1.0 means "same as the whole sample".
	if selected_metrics:
	    angles = np.linspace(0, 2 * np.pi, len(selected_metrics), endpoint=False)
	    # Repeat the first angle so every polygon closes on itself.
	    angles = np.concatenate((angles, angles[:1]))

	    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

	    baseline = overall_stats[selected_metrics]
	    for cid in selected_ids:
	        in_cluster = sampled_clustered['Cluster_KMeans'] == cid
	        c_stats = sampled_clustered.loc[in_cluster, selected_metrics].mean()

	        # Divide by the magnitude of the baseline: with a negative overall
	        # mean (presumably the case for loudness, if it is in negative dB —
	        # confirm against the data) a plain division would flip the
	        # direction and draw an above-average cluster below 1.
	        # Convert to a NumPy array before indexing: the Series is labelled
	        # by metric name, and positional `series[0]` lookups on such a
	        # Series are deprecated in pandas.
	        stats_norm = ((c_stats - baseline) / baseline.abs() + 1).to_numpy()
	        stats_norm = np.concatenate((stats_norm, stats_norm[:1]))

	        ax.plot(angles, stats_norm, 'o-', linewidth=2, label=archetype_names[cid])
	        ax.fill(angles, stats_norm, alpha=0.25)

	    ax.set_xticks(angles[:-1])
	    ax.set_xticklabels(selected_metrics)
	    ax.set_title("Relative Feature Profile", pad=30)
	    # A legend without any labelled line only produces a warning.
	    if selected_ids:
	        ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
	    st.pyplot(fig)
	    # pyplot keeps every figure alive until it is closed; without this a
	    # new figure would accumulate on each Streamlit rerun.
	    plt.close(fig)

	# Summary for each selected cluster: its size, absolute and as a share of
	# the clustered sample.
	total_songs = len(sampled_clustered)
	for cid in selected_ids:
	    n_songs = int((sampled_clustered['Cluster_KMeans'] == cid).sum())
	    st.markdown(f"""
	{archetype_names[cid]}
	- Songs in cluster: {n_songs}
	- % of dataset: {n_songs/total_songs*100:.1f}%
	""")

	# Bar chart of cluster sizes over the whole clustered sample (independent
	# of the sidebar selection), with the count written above each bar.
	sizes = sampled_clustered['Cluster_KMeans'].value_counts().sort_index()
	names = [archetype_names[i] for i in sizes.index]

	fig2, ax2 = plt.subplots(figsize=(8, 4))
	bars = ax2.bar(names, sizes, color="skyblue")
	ax2.set_ylabel("Number of Songs")
	ax2.set_title("Cluster Distribution")
	# Archetype names are long; tilt them so they do not overlap.
	plt.setp(ax2.get_xticklabels(), rotation=30, ha="right")

	for bar, size in zip(bars, sizes):
	    # Center the count horizontally on the bar, just above its top edge.
	    ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 1,
	             str(size), ha="center")

	st.pyplot(fig2)
	# pyplot keeps every figure alive until it is closed; without this a new
	# figure would accumulate on each Streamlit rerun.
	plt.close(fig2)

	# Preview table: the first 20 songs (title and artists) of the selection.
	st.subheader("Example Songs from Selected Cluster(s)")
	example_songs = cluster_data[['name', 'artists']].head(20)
	st.write(example_songs)