Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans | |
| from sklearn.metrics import silhouette_score | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Page configuration: browser-tab title/icon, wide layout, sidebar open on load.
_PAGE_SETTINGS = dict(
    page_title="Spotify Playlist Optimizer",
    page_icon="🎵",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS for better styling: top padding for the main container, a smaller
# metric font, and the green gradient banner used via class "cluster-header".
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }
    .stMetric > div > div > div > div {
        font-size: 1rem;
    }
    .cluster-header {
        background: linear-gradient(90deg, #1DB954, #1ed760);
        color: white;
        padding: 10px;
        border-radius: 5px;
        text-align: center;
        margin-bottom: 20px;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_data
def load_and_process_data():
    """Load the Spotify songs dataset, clean it, and cluster tracks by audio features.

    The CSV is downloaded from the TidyTuesday GitHub repository, de-duplicated
    on (track_name, track_artist), filtered on duration, tempo and popularity,
    standardized, projected with PCA, and clustered with KMeans on the first
    five principal components.

    The function is decorated with ``st.cache_data`` so that the download, the
    PCA fit and the KMeans fit are not repeated on every Streamlit rerun; the
    cached result is reused until the cache is cleared.

    Returns:
        tuple: ``(df_final, pca, scaler, audio_features, cluster_names)`` where
        ``df_final`` is the cleaned DataFrame with added ``Cluster``, ``PC1``,
        ``PC2``, ``PC3`` and ``Cluster_Name`` columns, ``pca`` and ``scaler``
        are the fitted scikit-learn objects, ``audio_features`` is the list of
        feature column names used for the analysis, and ``cluster_names`` maps
        each integer cluster id to its display label.
    """
    # Load data
    spotify_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
    df = pd.read_csv(spotify_url)

    # Audio features for analysis
    audio_features = [
        'danceability', 'energy', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'loudness', 'key', 'mode'
    ]

    # Cluster names based on characteristics.
    # NOTE(review): KMeans label ids are arbitrary; these labels are only
    # meaningful for the fixed random_state/data used below -- confirm they
    # still match the cluster contents if either changes.
    cluster_names = {
        0: "Energetic Mainstream",
        1: "Acoustic Chill",
        2: "High-Energy Party",
        3: "Moody & Introspective",
        4: "Workout & Motivation",
        5: "Focus & Background"
    }

    # Clean data: keep the first occurrence of each (track, artist) pair
    df_clean = df.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

    # Remove outliers: keep 30000 < duration_ms < 600000, 50 < tempo < 200,
    # and tracks with non-zero popularity
    outlier_conditions = (
        (df_clean['duration_ms'] > 30000) &
        (df_clean['duration_ms'] < 600000) &
        (df_clean['tempo'] > 50) &
        (df_clean['tempo'] < 200) &
        (df_clean['track_popularity'] > 0)
    )
    df_clean = df_clean[outlier_conditions]

    # Remove missing values
    df_clean = df_clean.dropna(subset=audio_features)

    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df_clean[audio_features])

    # Apply PCA (all components are kept; only a prefix is used for clustering)
    pca = PCA()
    pca_results = pca.fit_transform(features_scaled)

    # Clustering on the first n_components principal components. The number
    # of clusters is tied to the label table so the two cannot drift apart.
    n_components = 5
    kmeans = KMeans(n_clusters=len(cluster_names), random_state=42, n_init=10)
    clusters = kmeans.fit_predict(pca_results[:, :n_components])

    # Add results to dataframe
    df_final = df_clean.copy()
    df_final['Cluster'] = clusters
    df_final['PC1'] = pca_results[:, 0]
    df_final['PC2'] = pca_results[:, 1]
    df_final['PC3'] = pca_results[:, 2]
    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

    return df_final, pca, scaler, audio_features, cluster_names
def create_cluster_profile(df, cluster_id, audio_features):
    """Summarize one cluster relative to all rows of ``df``.

    Args:
        df: DataFrame containing a ``Cluster`` column, the columns named in
            ``audio_features``, and ``track_popularity``, ``playlist_genre``,
            ``track_name`` and ``track_artist``.
        cluster_id: Value of ``Cluster`` identifying the cluster to profile.
        audio_features: Names of the numeric columns to compare.

    Returns:
        dict with keys:
            ``size``: number of rows in the cluster.
            ``avg_popularity``: mean ``track_popularity`` of the cluster
                (NaN when the cluster is empty).
            ``top_genres``: the three most frequent ``playlist_genre`` values
                with their counts.
            ``differences``: list of dicts (``feature``, ``value``,
                ``diff_pct``) for features whose cluster mean deviates from
                the overall mean by more than 10 %, sorted by descending
                absolute deviation.
            ``sample_tracks``: the five most popular tracks of the cluster
                (name, artist, popularity).
    """
    cluster_data = df[df['Cluster'] == cluster_id]
    overall_stats = df[audio_features].mean()
    cluster_stats = cluster_data[audio_features].mean()

    # Calculate differences
    differences = []
    for feature in audio_features:
        baseline = overall_stats[feature]
        # A zero baseline has no meaningful percentage difference (the
        # division would yield inf/NaN), so such features are skipped.
        if baseline == 0:
            continue
        # Divide by the magnitude of the baseline so the sign of diff_pct
        # follows the sign of (cluster - overall) even when the overall mean
        # is negative (plausible for 'loudness' if it is in dB -- confirm
        # against the data).
        diff_pct = (cluster_stats[feature] - baseline) / abs(baseline) * 100
        # Only significant differences; NaN (empty cluster) fails this test
        # and is therefore dropped as well.
        if abs(diff_pct) > 10:
            differences.append({
                'feature': feature.replace('_', ' ').title(),
                'value': cluster_stats[feature],
                'diff_pct': diff_pct
            })
    differences.sort(key=lambda x: abs(x['diff_pct']), reverse=True)

    return {
        'size': len(cluster_data),
        'avg_popularity': cluster_data['track_popularity'].mean(),
        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
        'differences': differences,
        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[
            ['track_name', 'track_artist', 'track_popularity']
        ]
    }
def main():
    """Render the Spotify Playlist Optimizer dashboard.

    Loads the clustered track data, draws the header and sidebar controls
    (cluster, focus feature, minimum popularity, genres), filters the data
    accordingly, and renders: a PC1/PC2 scatter plot, key metrics for the
    selected cluster, a radar chart against the overall average, a feature
    distribution comparison, rule-based "business insights", and a static
    list of recommendations.

    If the sidebar filters exclude every track, a warning is shown and the
    remaining sections are skipped, because the market-share computations
    divide by the number of filtered tracks.
    """
    # Load data (the fitted PCA and scaler are returned but unused by the UI)
    df, _pca, _scaler, audio_features, cluster_names = load_and_process_data()

    # Header
    st.title("🎵 Spotify Playlist Optimizer")
    st.markdown("### Data-Driven Solutions for Music Engagement")

    # Business problem statement
    with st.expander("📊 Business Problem & Solution", expanded=True):
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("""
            **The Challenge:**
            - Streaming platforms face high skip rates that impact user engagement
            - Traditional genre-based grouping fails in real contexts
            - Poor playlist flow leads to user disengagement
            - Lost revenue from subscription churn
            """)
        with col2:
            st.markdown("""
            **Our Solution:**
            - Audio feature-based clustering identifies 6 playlist types
            - Data-driven curation reduces skip rates
            - Context-aware recommendations improve engagement
            - Actionable insights for streaming platforms
            """)

    # Sidebar controls
    st.sidebar.header("🎛️ Explore Clusters")

    # Control 1: Cluster Selection
    selected_cluster = st.sidebar.selectbox(
        "Select Playlist Category:",
        options=list(cluster_names.keys()),
        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
        index=2  # Default to High-Energy Party
    )

    # Control 2: Audio Feature Focus
    focus_feature = st.sidebar.selectbox(
        "Focus Audio Feature:",
        options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'],
        index=0
    )

    # Control 3: Popularity Filter
    min_popularity = st.sidebar.slider(
        "Minimum Track Popularity:",
        min_value=0,
        max_value=100,
        value=20,
        step=10
    )

    # Control 4: Genre Filter
    available_genres = df['playlist_genre'].unique()
    selected_genres = st.sidebar.multiselect(
        "Filter by Genres:",
        options=available_genres,
        default=available_genres
    )

    # Filter data based on controls
    filtered_df = df[
        (df['track_popularity'] >= min_popularity) &
        (df['playlist_genre'].isin(selected_genres))
    ]

    # Guard: with no rows left, the market-share divisions below would raise
    # ZeroDivisionError, so stop here with an explanation instead.
    if filtered_df.empty:
        st.warning(
            "No tracks match the current filters. "
            "Lower the minimum popularity or select more genres."
        )
        return

    # Main content area
    col1, col2 = st.columns([2, 1])
    with col1:
        # Visualization 1: Cluster scatter plot
        st.subheader("🎯 Playlist Categories in Audio Space")
        fig = px.scatter(
            filtered_df,
            x='PC1',
            y='PC2',
            color='Cluster_Name',
            size=focus_feature,
            hover_data=['track_name', 'track_artist', 'track_popularity'],
            title=f"Playlist Categories (sized by {focus_feature.title()})",
            width=700,
            height=500
        )
        # Highlight selected cluster
        selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
        fig.add_scatter(
            x=selected_cluster_data['PC1'],
            y=selected_cluster_data['PC2'],
            mode='markers',
            marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)),
            name=f'Selected: {cluster_names[selected_cluster]}',
            showlegend=True
        )
        fig.update_layout(
            xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
            yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)"
        )
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Key metrics for selected cluster
        cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
        # Share of the filtered tracks that fall in the selected cluster;
        # computed once and reused by the strategy section further down.
        market_share = cluster_profile['size'] / len(filtered_df)
        st.markdown(f"""
        <div class="cluster-header">
            <h3>{cluster_names[selected_cluster]}</h3>
        </div>
        """, unsafe_allow_html=True)
        st.metric("Tracks in Category", f"{cluster_profile['size']:,}")
        st.metric("Avg Popularity", f"{cluster_profile['avg_popularity']:.1f}/100")
        st.metric("Market Share", f"{market_share*100:.1f}%")

    # Visualization 2: Audio feature radar chart
    st.subheader("📊 Audio DNA Profile")
    col1, col2 = st.columns(2)
    with col1:
        # Radar chart for selected cluster
        cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
        radar_features = ['danceability', 'energy', 'valence', 'acousticness', 'speechiness', 'liveness']
        radar_labels = [f.title() for f in radar_features]
        cluster_means = cluster_data[radar_features].mean()
        overall_means = filtered_df[radar_features].mean()
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=cluster_means.values,
            theta=radar_labels,
            fill='toself',
            name=cluster_names[selected_cluster],
            line_color='#1DB954'
        ))
        fig.add_trace(go.Scatterpolar(
            r=overall_means.values,
            theta=radar_labels,
            fill='toself',
            name='Overall Average',
            line_color='gray',
            opacity=0.5
        ))
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )),
            showlegend=True,
            title="Cluster vs Overall Average"
        )
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Distinctive characteristics
        st.write("**Key Characteristics:**")
        for diff in cluster_profile['differences'][:5]:
            direction = "📈" if diff['diff_pct'] > 0 else "📉"
            st.write(f"{direction} **{diff['feature']}**: {diff['value']:.3f} ({diff['diff_pct']:+.1f}%)")
        st.write("**Top Genres:**")
        for genre, count in cluster_profile['top_genres'].items():
            percentage = (count / cluster_profile['size']) * 100
            st.write(f"• {genre}: {percentage:.1f}%")

    # Visualization 3: Feature distribution comparison
    st.subheader("🎵 Feature Deep Dive")
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison')
    )
    # Distribution plot
    in_selected = filtered_df['Cluster'] == selected_cluster
    cluster_focus = filtered_df[in_selected][focus_feature]
    other_focus = filtered_df[~in_selected][focus_feature]
    fig.add_trace(
        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster], opacity=0.7, nbinsx=30),
        row=1, col=1
    )
    fig.add_trace(
        go.Histogram(x=other_focus, name='Other Clusters', opacity=0.5, nbinsx=30),
        row=1, col=1
    )
    # Box plot comparison
    for cluster_id, cluster_label in cluster_names.items():
        cluster_data = filtered_df[filtered_df['Cluster'] == cluster_id]
        fig.add_trace(
            go.Box(y=cluster_data[focus_feature], name=cluster_label,
                   boxmean=True, marker_color='red' if cluster_id == selected_cluster else None),
            row=1, col=2
        )
    # barmode='overlay' draws the two semi-transparent histograms on top of
    # each other; without it the bars are grouped side by side and the
    # opacity settings above have no purpose.
    fig.update_layout(height=400, showlegend=True, barmode='overlay')
    st.plotly_chart(fig, use_container_width=True)

    # Dynamic Insights
    st.subheader("💡 Dynamic Business Insights")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("**Category Strategy:**")
        if market_share > 0.20:
            strategy = "MARKET LEADER"
            recommendation = "Focus on differentiation and premium sub-segments"
        elif market_share > 0.12:
            strategy = "GROWTH OPPORTUNITY"
            recommendation = "Expand content library and increase user awareness"
        else:
            strategy = "NICHE EXCELLENCE"
            recommendation = "Perfect the experience for dedicated users"
        st.success(f"**{strategy}**")
        st.write(recommendation)

        # Skip risk assessment (heuristic thresholds on average popularity)
        avg_popularity = cluster_profile['avg_popularity']
        if avg_popularity > 60:
            skip_risk = "LOW"
            risk_color = "green"
        elif avg_popularity > 40:
            skip_risk = "MEDIUM"
            risk_color = "orange"
        else:
            skip_risk = "HIGH"
            risk_color = "red"
        st.markdown(f"**Skip Risk**: :{risk_color}[{skip_risk}]")

    with col2:
        st.markdown("**Sample Popular Tracks:**")
        for i, (_, track) in enumerate(cluster_profile['sample_tracks'].head(3).iterrows(), 1):
            st.write(f"{i}. **{track['track_name']}** - {track['track_artist']} (Pop: {track['track_popularity']})")

        # Context recommendations
        st.markdown("**Best Use Cases:**")
        use_cases = {
            0: ["Background listening", "Casual playlists"],
            1: ["Coffee shops", "Study sessions", "Relaxation"],
            2: ["Parties", "Clubs", "High-intensity workouts"],
            3: ["Evening listening", "Emotional moments"],
            4: ["Gym workouts", "Running", "Motivation"],
            5: ["Work", "Focus sessions", "Ambient background"]
        }
        for use_case in use_cases.get(selected_cluster, ["General listening"]):
            st.write(f"• {use_case}")

    # Summary recommendations
    st.subheader("🎯 Actionable Recommendations")
    recommendations = [
        "**Algorithm Enhancement**: Use cluster boundaries for better song transitions",
        "**Playlist Curation**: Create context-specific playlists based on cluster profiles",
        "**User Interface**: Implement audio feature sliders for personalized discovery",
        "**Skip Prediction**: Monitor cross-cluster jumps to predict skip likelihood",
        "**Revenue Optimization**: Target B2B licensing for specific cluster use cases"
    ]
    for rec in recommendations:
        st.write(f"• {rec}")

    # Footer
    st.markdown("---")
    st.markdown("""
    **Key Insight**: This analysis reveals that audio features, not genres, determine playlist compatibility.
    By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement
    through data-driven curation.
    """)
# Entry point: render the dashboard when this module runs as __main__.
if __name__ == "__main__":
    main()