"""Streamlit dashboard that clusters Spotify tracks by audio features.

The app downloads the TidyTuesday Spotify songs dataset, standardizes a set of
audio features, projects them with PCA, groups tracks with KMeans and lets the
user explore the resulting clusters interactively.
"""

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Page configuration
st.set_page_config(
    page_title="Spotify Playlist Optimizer",
    page_icon="🎵",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling
# NOTE(review): the markdown body is blank here; the stylesheet appears to be
# missing from this copy of the file -- confirm against the original.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_data
def load_and_process_data():
    """Load the Spotify dataset, clean it, and assign each track to a cluster.

    Returns:
        A tuple ``(df_final, pca, scaler, audio_features, cluster_names)``:
        the cleaned dataframe with ``Cluster``, ``PC1``..``PC3`` and
        ``Cluster_Name`` columns added, the fitted PCA and StandardScaler
        objects, the list of audio feature column names, and the mapping from
        cluster id to display name.
    """
    # Load data
    spotify_url = (
        'https://raw.githubusercontent.com/rfordatascience/tidytuesday/'
        'master/data/2020/2020-01-21/spotify_songs.csv'
    )
    df = pd.read_csv(spotify_url)

    # Audio features for analysis
    audio_features = [
        'danceability', 'energy', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'loudness', 'key', 'mode'
    ]

    # Clean data: keep the first occurrence of each (track, artist) pair
    df_clean = df.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

    # Remove outliers (very short/long tracks, extreme tempos, zero popularity)
    outlier_conditions = (
        (df_clean['duration_ms'] > 30000) &
        (df_clean['duration_ms'] < 600000) &
        (df_clean['tempo'] > 50) &
        (df_clean['tempo'] < 200) &
        (df_clean['track_popularity'] > 0)
    )
    df_clean = df_clean[outlier_conditions]

    # Remove missing values
    df_clean = df_clean.dropna(subset=audio_features)

    # Scale features so every feature contributes comparably to PCA
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df_clean[audio_features])

    # Apply PCA (all components are kept; a subset is used for clustering)
    pca = PCA()
    pca_results = pca.fit_transform(features_scaled)

    # Clustering on the leading principal components
    n_components = 5
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(pca_results[:, :n_components])

    # Add results to dataframe
    df_final = df_clean.copy()
    df_final['Cluster'] = clusters
    df_final['PC1'] = pca_results[:, 0]
    df_final['PC2'] = pca_results[:, 1]
    df_final['PC3'] = pca_results[:, 2]

    # Cluster names based on characteristics
    # NOTE(review): KMeans label ids are arbitrary; this mapping presumably
    # reflects the labels observed for this data and random_state -- confirm
    # the names still match the cluster profiles if the data or model change.
    cluster_names = {
        0: "Energetic Mainstream",
        1: "Acoustic Chill",
        2: "High-Energy Party",
        3: "Moody & Introspective",
        4: "Workout & Motivation",
        5: "Focus & Background"
    }
    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

    return df_final, pca, scaler, audio_features, cluster_names


def create_cluster_profile(df, cluster_id, audio_features):
    """Summarize how one cluster differs from the dataframe as a whole.

    Args:
        df: Dataframe containing ``Cluster``, ``track_popularity``,
            ``playlist_genre``, ``track_name``, ``track_artist`` and the
            columns listed in ``audio_features``.
        cluster_id: Value of the ``Cluster`` column to profile.
        audio_features: Names of the feature columns to compare.

    Returns:
        A dict with keys ``size`` (row count of the cluster),
        ``avg_popularity`` (NaN when the cluster is empty), ``top_genres``
        (the three most frequent genres as a value-count Series),
        ``differences`` (features whose cluster mean deviates from the overall
        mean by more than 10%, sorted by absolute deviation, descending) and
        ``sample_tracks`` (the five most popular tracks of the cluster).
    """
    cluster_data = df[df['Cluster'] == cluster_id]
    overall_stats = df[audio_features].mean()
    cluster_stats = cluster_data[audio_features].mean()

    # Calculate differences
    differences = []
    for feature in audio_features:
        baseline = overall_stats[feature]
        # A zero or missing baseline has no meaningful relative difference.
        if pd.isna(baseline) or baseline == 0:
            continue
        # Divide by the magnitude of the baseline so that features with a
        # negative mean (e.g. loudness) keep the correct sign: a higher
        # cluster mean always yields a positive percentage.
        diff_pct = (cluster_stats[feature] - baseline) / abs(baseline) * 100
        # NaN (empty cluster) compares False here and is skipped as well.
        if abs(diff_pct) > 10:  # Only significant differences
            differences.append({
                'feature': feature.replace('_', ' ').title(),
                'value': cluster_stats[feature],
                'diff_pct': diff_pct
            })

    differences.sort(key=lambda x: abs(x['diff_pct']), reverse=True)

    return {
        'size': len(cluster_data),
        'avg_popularity': cluster_data['track_popularity'].mean(),
        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
        'differences': differences,
        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[
            ['track_name', 'track_artist', 'track_popularity']
        ]
    }


def main():
    """Render the dashboard: sidebar controls, charts and insights."""
    # Load data
    df, pca, scaler, audio_features, cluster_names = load_and_process_data()

    # Header
    st.title("🎵 Spotify Playlist Optimizer")
    st.markdown("### Data-Driven Solutions for Music Engagement")

    # Business problem statement
    with st.expander("📊 Business Problem & Solution", expanded=True):
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""
            **The Challenge:**
            - Streaming platforms face high skip rates that impact user engagement
            - Traditional genre-based grouping fails in real contexts
            - Poor playlist flow leads to user disengagement
            - Lost revenue from subscription churn
            """)

        with col2:
            st.markdown("""
            **Our Solution:**
            - Audio feature-based clustering identifies 6 playlist types
            - Data-driven curation reduces skip rates
            - Context-aware recommendations improve engagement
            - Actionable insights for streaming platforms
            """)

    # Sidebar controls
    st.sidebar.header("🎛️ Explore Clusters")

    # Control 1: Cluster Selection
    selected_cluster = st.sidebar.selectbox(
        "Select Playlist Category:",
        options=list(cluster_names.keys()),
        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
        index=2  # Default to High-Energy Party
    )

    # Control 2: Audio Feature Focus
    focus_feature = st.sidebar.selectbox(
        "Focus Audio Feature:",
        options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'],
        index=0
    )

    # Control 3: Popularity Filter
    min_popularity = st.sidebar.slider(
        "Minimum Track Popularity:",
        min_value=0,
        max_value=100,
        value=20,
        step=10
    )

    # Control 4: Genre Filter
    available_genres = df['playlist_genre'].unique()
    selected_genres = st.sidebar.multiselect(
        "Filter by Genres:",
        options=available_genres,
        default=available_genres
    )

    # Filter data based on controls
    filtered_df = df[
        (df['track_popularity'] >= min_popularity) &
        (df['playlist_genre'].isin(selected_genres))
    ]

    # With no rows left, every ratio below would divide by zero; stop early
    # with a hint instead of crashing.
    if filtered_df.empty:
        st.warning(
            "No tracks match the current filters. Lower the minimum "
            "popularity or select at least one genre."
        )
        return

    selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
    cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
    market_share = cluster_profile['size'] / len(filtered_df)

    # Main content area
    col1, col2 = st.columns([2, 1])

    with col1:
        # Visualization 1: Cluster scatter plot
        st.subheader("🎯 Playlist Categories in Audio Space")

        fig = px.scatter(
            filtered_df,
            x='PC1',
            y='PC2',
            color='Cluster_Name',
            size=focus_feature,
            hover_data=['track_name', 'track_artist', 'track_popularity'],
            title=f"Playlist Categories (sized by {focus_feature.title()})",
            width=700,
            height=500
        )

        # Highlight selected cluster
        fig.add_scatter(
            x=selected_cluster_data['PC1'],
            y=selected_cluster_data['PC2'],
            mode='markers',
            marker=dict(color='red', size=12, symbol='diamond',
                        line=dict(color='white', width=2)),
            name=f'Selected: {cluster_names[selected_cluster]}',
            showlegend=True
        )

        fig.update_layout(
            xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
            yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)"
        )

        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Key metrics for selected cluster
        # NOTE(review): the surrounding HTML wrapper appears to be missing
        # from this copy of the file; only the cluster name is emitted.
        st.markdown(
            f"\n\n{cluster_names[selected_cluster]}\n\n",
            unsafe_allow_html=True
        )

        if cluster_profile['size'] == 0:
            st.info("No tracks from this category match the current filters.")
        else:
            st.metric("Tracks in Category", f"{cluster_profile['size']:,}")
            st.metric("Avg Popularity", f"{cluster_profile['avg_popularity']:.1f}/100")
            st.metric("Market Share", f"{market_share * 100:.1f}%")

    # The remaining sections describe the selected cluster only; with no
    # tracks they would show NaN values and a misleading risk rating.
    if cluster_profile['size'] == 0:
        return

    # Visualization 2: Audio feature radar chart
    st.subheader("📊 Audio DNA Profile")

    col1, col2 = st.columns(2)

    with col1:
        # Radar chart for selected cluster
        radar_features = ['danceability', 'energy', 'valence',
                          'acousticness', 'speechiness', 'liveness']
        radar_labels = [f.title() for f in radar_features]

        cluster_means = selected_cluster_data[radar_features].mean()
        overall_means = filtered_df[radar_features].mean()

        fig = go.Figure()

        fig.add_trace(go.Scatterpolar(
            r=cluster_means.values,
            theta=radar_labels,
            fill='toself',
            name=cluster_names[selected_cluster],
            line_color='#1DB954'
        ))

        fig.add_trace(go.Scatterpolar(
            r=overall_means.values,
            theta=radar_labels,
            fill='toself',
            name='Overall Average',
            line_color='gray',
            opacity=0.5
        ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )),
            showlegend=True,
            title="Cluster vs Overall Average"
        )

        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Distinctive characteristics
        st.write("**Key Characteristics:**")
        for diff in cluster_profile['differences'][:5]:
            direction = "📈" if diff['diff_pct'] > 0 else "📉"
            st.write(
                f"{direction} **{diff['feature']}**: {diff['value']:.3f} "
                f"({diff['diff_pct']:+.1f}%)"
            )

        st.write("**Top Genres:**")
        for genre, count in cluster_profile['top_genres'].items():
            percentage = (count / cluster_profile['size']) * 100
            st.write(f"• {genre}: {percentage:.1f}%")

    # Visualization 3: Feature distribution comparison
    st.subheader("🎵 Feature Deep Dive")

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison')
    )

    # Distribution plot
    cluster_focus = selected_cluster_data[focus_feature]
    other_focus = filtered_df[filtered_df['Cluster'] != selected_cluster][focus_feature]

    fig.add_trace(
        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster],
                     opacity=0.7, nbinsx=30),
        row=1, col=1
    )

    fig.add_trace(
        go.Histogram(x=other_focus, name='Other Clusters',
                     opacity=0.5, nbinsx=30),
        row=1, col=1
    )

    # Box plot comparison
    for cluster_id, cluster_label in cluster_names.items():
        cluster_data = filtered_df[filtered_df['Cluster'] == cluster_id]
        fig.add_trace(
            go.Box(y=cluster_data[focus_feature],
                   name=cluster_label,
                   boxmean=True,
                   marker_color='red' if cluster_id == selected_cluster else None),
            row=1, col=2
        )

    fig.update_layout(height=400, showlegend=True)
    st.plotly_chart(fig, use_container_width=True)

    # Dynamic Insights
    st.subheader("💡 Dynamic Business Insights")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**Category Strategy:**")

        if market_share > 0.20:
            strategy = "MARKET LEADER"
            recommendation = "Focus on differentiation and premium sub-segments"
        elif market_share > 0.12:
            strategy = "GROWTH OPPORTUNITY"
            recommendation = "Expand content library and increase user awareness"
        else:
            strategy = "NICHE EXCELLENCE"
            recommendation = "Perfect the experience for dedicated users"

        st.success(f"**{strategy}**")
        st.write(recommendation)

        # Skip risk assessment (average popularity is used as the indicator)
        avg_popularity = cluster_profile['avg_popularity']
        if avg_popularity > 60:
            skip_risk = "LOW"
            risk_color = "green"
        elif avg_popularity > 40:
            skip_risk = "MEDIUM"
            risk_color = "orange"
        else:
            skip_risk = "HIGH"
            risk_color = "red"

        st.markdown(f"**Skip Risk**: :{risk_color}[{skip_risk}]")

    with col2:
        st.markdown("**Sample Popular Tracks:**")
        top_tracks = cluster_profile['sample_tracks'].head(3)
        for i, (_, track) in enumerate(top_tracks.iterrows(), 1):
            st.write(
                f"{i}. **{track['track_name']}** - {track['track_artist']} "
                f"(Pop: {track['track_popularity']})"
            )

        # Context recommendations
        st.markdown("**Best Use Cases:**")
        use_cases = {
            0: ["Background listening", "Casual playlists"],
            1: ["Coffee shops", "Study sessions", "Relaxation"],
            2: ["Parties", "Clubs", "High-intensity workouts"],
            3: ["Evening listening", "Emotional moments"],
            4: ["Gym workouts", "Running", "Motivation"],
            5: ["Work", "Focus sessions", "Ambient background"]
        }

        for use_case in use_cases.get(selected_cluster, ["General listening"]):
            st.write(f"• {use_case}")

    # Summary recommendations
    st.subheader("🎯 Actionable Recommendations")

    recommendations = [
        "**Algorithm Enhancement**: Use cluster boundaries for better song transitions",
        "**Playlist Curation**: Create context-specific playlists based on cluster profiles",
        "**User Interface**: Implement audio feature sliders for personalized discovery",
        "**Skip Prediction**: Monitor cross-cluster jumps to predict skip likelihood",
        "**Revenue Optimization**: Target B2B licensing for specific cluster use cases"
    ]

    for rec in recommendations:
        st.write(f"• {rec}")

    # Footer
    st.markdown("---")
    st.markdown("""
    **Key Insight**: This analysis reveals that audio features, not genres, determine playlist compatibility.
    By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement
    through data-driven curation.
    """)


if __name__ == "__main__":
    main()