import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import warnings warnings.filterwarnings('ignore') # Page configuration st.set_page_config( page_title="Spotify Playlist Optimizer", page_icon="🎵", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) @st.cache_data def load_and_process_data(): """Load and process Spotify data with clustering""" # Load data spotify_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv' df = pd.read_csv(spotify_url) # Audio features for analysis audio_features = [ 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'loudness', 'key', 'mode' ] # Clean data df_clean = df.drop_duplicates(subset=['track_name', 'track_artist'], keep='first') # Remove outliers outlier_conditions = ( (df_clean['duration_ms'] > 30000) & (df_clean['duration_ms'] < 600000) & (df_clean['tempo'] > 50) & (df_clean['tempo'] < 200) & (df_clean['track_popularity'] > 0) ) df_clean = df_clean[outlier_conditions] # Remove missing values df_clean = df_clean.dropna(subset=audio_features) # Scale features scaler = StandardScaler() features_scaled = scaler.fit_transform(df_clean[audio_features]) # Apply PCA pca = PCA() pca_results = pca.fit_transform(features_scaled) # Clustering n_components = 5 kmeans = KMeans(n_clusters=6, random_state=42, n_init=10) clusters = kmeans.fit_predict(pca_results[:, :n_components]) # Add results to dataframe df_final = df_clean.copy() df_final['Cluster'] = clusters df_final['PC1'] = pca_results[:, 0] df_final['PC2'] = pca_results[:, 1] df_final['PC3'] = pca_results[:, 2] # Cluster names based on characteristics cluster_names = { 0: "Energetic Mainstream", 1: "Acoustic Chill", 2: "High-Energy Party", 3: "Moody & Introspective", 4: "Workout & Motivation", 5: "Focus & Background" } df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names) return df_final, pca, scaler, audio_features, cluster_names def create_cluster_profile(df, cluster_id, audio_features): """Create detailed cluster profile""" cluster_data = df[df['Cluster'] == cluster_id] overall_stats = df[audio_features].mean() cluster_stats = cluster_data[audio_features].mean() # Calculate differences differences = [] for feature in audio_features: diff_pct = ((cluster_stats[feature] - overall_stats[feature]) / overall_stats[feature]) * 100 if abs(diff_pct) > 10: # Only significant differences differences.append({ 'feature': feature.replace('_', ' ').title(), 'value': cluster_stats[feature], 'diff_pct': diff_pct }) differences.sort(key=lambda x: abs(x['diff_pct']), reverse=True) return { 'size': len(cluster_data), 'avg_popularity': cluster_data['track_popularity'].mean(), 'top_genres': cluster_data['playlist_genre'].value_counts().head(3), 'differences': differences, 'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[['track_name', 'track_artist', 'track_popularity']] } def main(): # Load data df, pca, scaler, audio_features, cluster_names = load_and_process_data() # Header st.title("🎵 Spotify Playlist Optimizer") st.markdown("### Data-Driven Solutions for Music Engagement") # Business problem statement with st.expander("📊 Business Problem & Solution", expanded=True): col1, col2 = st.columns(2) with col1: st.markdown(""" **The Challenge:** - Streaming platforms face high skip rates that impact user engagement - Traditional genre-based grouping fails in real contexts - Poor playlist flow leads to user disengagement - Lost revenue from subscription churn """) with col2: st.markdown(""" **Our Solution:** - Audio feature-based clustering identifies 6 playlist types - Data-driven curation reduces skip rates - Context-aware recommendations improve engagement - Actionable insights for streaming platforms """) # Sidebar controls st.sidebar.header("🎛️ Explore Clusters") # Control 1: Cluster Selection selected_cluster = st.sidebar.selectbox( "Select Playlist Category:", options=list(cluster_names.keys()), format_func=lambda x: f"{cluster_names[x]} (Cluster {x})", index=2 # Default to High-Energy Party ) # Control 2: Audio Feature Focus focus_feature = st.sidebar.selectbox( "Focus Audio Feature:", options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'], index=0 ) # Control 3: Popularity Filter min_popularity = st.sidebar.slider( "Minimum Track Popularity:", min_value=0, max_value=100, value=20, step=10 ) # Control 4: Genre Filter available_genres = df['playlist_genre'].unique() selected_genres = st.sidebar.multiselect( "Filter by Genres:", options=available_genres, default=available_genres ) # Filter data based on controls filtered_df = df[ (df['track_popularity'] >= min_popularity) & (df['playlist_genre'].isin(selected_genres)) ] # Main content area col1, col2 = st.columns([2, 1]) with col1: # Visualization 1: Cluster scatter plot st.subheader("🎯 Playlist Categories in Audio Space") fig = px.scatter( filtered_df, x='PC1', y='PC2', color='Cluster_Name', size=focus_feature, hover_data=['track_name', 'track_artist', 'track_popularity'], title=f"Playlist Categories (sized by {focus_feature.title()})", width=700, height=500 ) # Highlight selected cluster selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster] fig.add_scatter( x=selected_cluster_data['PC1'], y=selected_cluster_data['PC2'], mode='markers', marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)), name=f'Selected: {cluster_names[selected_cluster]}', showlegend=True ) fig.update_layout( xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)", yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)" ) st.plotly_chart(fig, use_container_width=True) with col2: # Key metrics for selected cluster cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features) st.markdown(f"""