# Source: Hugging Face Space "Peter512/UML_assignment" (status: Sleeping)
# File size: 14,781 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
# Suppress all Python warnings for the lifetime of the app process.
warnings.filterwarnings('ignore')

# Page-level settings (tab title, icon, wide layout, sidebar open on load).
_PAGE_SETTINGS = {
    "page_title": "Spotify Playlist Optimizer",
    "page_icon": "🎵",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected into the page; `.cluster-header` is the class used by
# the HTML banner that main() renders for the selected cluster.
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }
    .stMetric > div > div > div > div {
        font-size: 1rem;
    }
    .cluster-header {
        background: linear-gradient(90deg, #1DB954, #1ed760);
        color: white;
        padding: 10px;
        border-radius: 5px;
        text-align: center;
        margin-bottom: 20px;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

@st.cache_data
def load_and_process_data():
    """Fetch the Spotify songs CSV, clean it, and label every track with a cluster.

    Pipeline: read the CSV, drop duplicate (track_name, track_artist) pairs,
    keep rows inside the duration / tempo / popularity bounds, drop rows with
    a missing audio feature, standardise the features, run PCA, then fit a
    6-cluster KMeans on the first five principal components.

    Returns:
        tuple: ``(df_final, pca, scaler, audio_features, cluster_names)``
        where ``df_final`` is the cleaned frame with the extra columns
        ``Cluster``, ``PC1``, ``PC2``, ``PC3`` and ``Cluster_Name``; ``pca``
        and ``scaler`` are the fitted transformers; ``audio_features`` is the
        list of feature columns used; ``cluster_names`` maps cluster id to
        its display label.
    """
    # Columns fed into scaling / PCA / clustering.
    audio_features = [
        'danceability', 'energy', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'loudness', 'key', 'mode'
    ]

    # Display labels, hard-coded per KMeans cluster id.
    cluster_names = {
        0: "Energetic Mainstream",
        1: "Acoustic Chill", 
        2: "High-Energy Party",
        3: "Moody & Introspective",
        4: "Workout & Motivation",
        5: "Focus & Background"
    }

    # Download and keep the first occurrence of each (name, artist) pair.
    source_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
    tracks = pd.read_csv(source_url).drop_duplicates(
        subset=['track_name', 'track_artist'], keep='first'
    )

    # Row filters: duration in (30 s, 10 min), tempo in (50, 200), popularity > 0.
    duration_ok = (tracks['duration_ms'] > 30000) & (tracks['duration_ms'] < 600000)
    tempo_ok = (tracks['tempo'] > 50) & (tracks['tempo'] < 200)
    has_popularity = tracks['track_popularity'] > 0
    tracks = tracks[duration_ok & tempo_ok & has_popularity]

    # Discard rows where any audio feature is missing.
    tracks = tracks.dropna(subset=audio_features)

    # Standardise, then project onto principal components (all are kept).
    scaler = StandardScaler()
    pca = PCA()
    components = pca.fit_transform(scaler.fit_transform(tracks[audio_features]))

    # Cluster on the leading principal components only.
    n_clustering_components = 5
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    labels = kmeans.fit_predict(components[:, :n_clustering_components])

    # Attach cluster ids and the first three components to a fresh copy.
    df_final = tracks.assign(
        Cluster=labels,
        PC1=components[:, 0],
        PC2=components[:, 1],
        PC3=components[:, 2],
    )
    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

    return df_final, pca, scaler, audio_features, cluster_names

def create_cluster_profile(df, cluster_id, audio_features):
    """Summarise one cluster relative to all rows of ``df``.

    Args:
        df: DataFrame containing at least the columns ``Cluster``,
            ``track_popularity``, ``playlist_genre``, ``track_name``,
            ``track_artist`` and every column named in ``audio_features``.
        cluster_id: Value of the ``Cluster`` column selecting the cluster.
        audio_features: Column names to compare against the overall mean.

    Returns:
        dict with keys:
            ``size``: number of rows in the cluster.
            ``avg_popularity``: mean ``track_popularity`` (NaN if the cluster
                is empty).
            ``top_genres``: value counts of the three most common
                ``playlist_genre`` values.
            ``differences``: list of dicts (``feature``, ``value``,
                ``diff_pct``) for features whose cluster mean deviates from
                the overall mean by more than 10 %, sorted by absolute
                deviation, largest first.
            ``sample_tracks``: the up-to-five most popular tracks with their
                name, artist and popularity.
    """
    cluster_data = df[df['Cluster'] == cluster_id]
    overall_stats = df[audio_features].mean()
    cluster_stats = cluster_data[audio_features].mean()

    differences = []
    for feature in audio_features:
        baseline = overall_stats[feature]
        # A zero or missing baseline has no meaningful percentage change.
        if pd.isna(baseline) or baseline == 0:
            continue

        # Divide by the magnitude of the baseline so the sign of diff_pct
        # always follows the direction of change, even when the baseline
        # itself is negative (e.g. loudness measured in dB).
        diff_pct = (cluster_stats[feature] - baseline) / abs(baseline) * 100

        # Only keep significant differences; a NaN diff (empty cluster)
        # fails this comparison and is dropped as well.
        if abs(diff_pct) > 10:
            differences.append({
                'feature': feature.replace('_', ' ').title(),
                'value': cluster_stats[feature],
                'diff_pct': diff_pct
            })

    differences.sort(key=lambda x: abs(x['diff_pct']), reverse=True)

    return {
        'size': len(cluster_data),
        'avg_popularity': cluster_data['track_popularity'].mean(),
        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
        'differences': differences,
        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[['track_name', 'track_artist', 'track_popularity']]
    }

def main():
    """Render the Spotify Playlist Optimizer dashboard.

    Loads the clustered dataset, draws the sidebar controls (cluster, focus
    feature, minimum popularity, genres), filters the tracks accordingly and
    renders the scatter plot, key metrics, radar chart, feature
    distributions, business insights and recommendations for the selected
    cluster.

    If the sidebar filters exclude every track, a warning is shown and the
    remaining sections are skipped, because the market-share figures divide
    by the number of filtered tracks.
    """
    # Load data
    df, pca, scaler, audio_features, cluster_names = load_and_process_data()
    
    # Header
    st.title("🎵 Spotify Playlist Optimizer")
    st.markdown("### Data-Driven Solutions for Music Engagement")
    
    # Business problem statement
    with st.expander("📊 Business Problem & Solution", expanded=True):
        col1, col2 = st.columns(2)
        
        with col1:
            st.markdown("""
            **The Challenge:**
            - Streaming platforms face high skip rates that impact user engagement
            - Traditional genre-based grouping fails in real contexts
            - Poor playlist flow leads to user disengagement
            - Lost revenue from subscription churn
            """)
        
        with col2:
            st.markdown("""
            **Our Solution:**
            - Audio feature-based clustering identifies 6 playlist types
            - Data-driven curation reduces skip rates
            - Context-aware recommendations improve engagement
            - Actionable insights for streaming platforms
            """)
    
    # Sidebar controls
    st.sidebar.header("🎛️ Explore Clusters")
    
    # Control 1: Cluster Selection
    selected_cluster = st.sidebar.selectbox(
        "Select Playlist Category:",
        options=list(cluster_names.keys()),
        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
        index=2  # Default to High-Energy Party
    )
    
    # Control 2: Audio Feature Focus
    focus_feature = st.sidebar.selectbox(
        "Focus Audio Feature:",
        options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'],
        index=0
    )
    
    # Control 3: Popularity Filter
    min_popularity = st.sidebar.slider(
        "Minimum Track Popularity:",
        min_value=0,
        max_value=100,
        value=20,
        step=10
    )
    
    # Control 4: Genre Filter
    available_genres = df['playlist_genre'].unique()
    selected_genres = st.sidebar.multiselect(
        "Filter by Genres:",
        options=available_genres,
        default=available_genres
    )
    
    # Filter data based on controls
    filtered_df = df[
        (df['track_popularity'] >= min_popularity) &
        (df['playlist_genre'].isin(selected_genres))
    ]
    
    # Nothing to show (and market share would divide by zero) when the
    # filters exclude every track, e.g. when all genres are deselected.
    if filtered_df.empty:
        st.warning(
            "No tracks match the current filters. Lower the minimum "
            "popularity or select at least one genre."
        )
        return
    
    # Profile of the selected cluster within the filtered data; reused by the
    # metrics, characteristics and insights sections below.
    cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
    cluster_size = cluster_profile['size']
    market_share = cluster_size / len(filtered_df)
    
    # Main content area
    col1, col2 = st.columns([2, 1])
    
    with col1:
        # Visualization 1: Cluster scatter plot
        st.subheader("🎯 Playlist Categories in Audio Space")
        
        fig = px.scatter(
            filtered_df,
            x='PC1',
            y='PC2',
            color='Cluster_Name',
            size=focus_feature,
            hover_data=['track_name', 'track_artist', 'track_popularity'],
            title=f"Playlist Categories (sized by {focus_feature.title()})",
            width=700,
            height=500
        )
        
        # Highlight selected cluster
        selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
        fig.add_scatter(
            x=selected_cluster_data['PC1'],
            y=selected_cluster_data['PC2'],
            mode='markers',
            marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)),
            name=f'Selected: {cluster_names[selected_cluster]}',
            showlegend=True
        )
        
        fig.update_layout(
            xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
            yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)"
        )
        
        st.plotly_chart(fig, use_container_width=True)
    
    with col2:
        # Key metrics for selected cluster
        st.markdown(f"""
        <div class="cluster-header">
            <h3>{cluster_names[selected_cluster]}</h3>
        </div>
        """, unsafe_allow_html=True)
        
        # The mean popularity of an empty cluster is NaN; show "N/A" instead.
        if cluster_size:
            avg_popularity_label = f"{cluster_profile['avg_popularity']:.1f}/100"
        else:
            avg_popularity_label = "N/A"
        
        st.metric("Tracks in Category", f"{cluster_size:,}")
        st.metric("Avg Popularity", avg_popularity_label)
        st.metric("Market Share", f"{market_share * 100:.1f}%")
    
    # Visualization 2: Audio feature radar chart
    st.subheader("📊 Audio DNA Profile")
    
    col1, col2 = st.columns(2)
    
    with col1:
        # Radar chart for selected cluster
        cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
        radar_features = ['danceability', 'energy', 'valence', 'acousticness', 'speechiness', 'liveness']
        radar_labels = [f.title() for f in radar_features]
        
        cluster_means = cluster_data[radar_features].mean()
        overall_means = filtered_df[radar_features].mean()
        
        fig = go.Figure()
        
        fig.add_trace(go.Scatterpolar(
            r=cluster_means.values,
            theta=radar_labels,
            fill='toself',
            name=cluster_names[selected_cluster],
            line_color='#1DB954'
        ))
        
        fig.add_trace(go.Scatterpolar(
            r=overall_means.values,
            theta=radar_labels,
            fill='toself',
            name='Overall Average',
            line_color='gray',
            opacity=0.5
        ))
        
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )),
            showlegend=True,
            title="Cluster vs Overall Average"
        )
        
        st.plotly_chart(fig, use_container_width=True)
    
    with col2:
        # Distinctive characteristics
        st.write("**Key Characteristics:**")
        for diff in cluster_profile['differences'][:5]:
            direction = "📈" if diff['diff_pct'] > 0 else "📉"
            st.write(f"{direction} **{diff['feature']}**: {diff['value']:.3f} ({diff['diff_pct']:+.1f}%)")
        
        st.write("**Top Genres:**")
        # top_genres is empty for an empty cluster, so cluster_size is > 0 here.
        for genre, count in cluster_profile['top_genres'].items():
            percentage = (count / cluster_size) * 100
            st.write(f"• {genre}: {percentage:.1f}%")
    
    # Visualization 3: Feature distribution comparison
    st.subheader("🎵 Feature Deep Dive")
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison')
    )
    
    # Distribution plot
    in_selected_cluster = filtered_df['Cluster'] == selected_cluster
    cluster_focus = filtered_df[in_selected_cluster][focus_feature]
    other_focus = filtered_df[~in_selected_cluster][focus_feature]
    
    fig.add_trace(
        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster], opacity=0.7, nbinsx=30),
        row=1, col=1
    )
    fig.add_trace(
        go.Histogram(x=other_focus, name='Other Clusters', opacity=0.5, nbinsx=30),
        row=1, col=1
    )
    
    # Box plot comparison (selected cluster drawn in red)
    for cluster_id, cluster_label in cluster_names.items():
        cluster_data = filtered_df[filtered_df['Cluster'] == cluster_id]
        fig.add_trace(
            go.Box(y=cluster_data[focus_feature], name=cluster_label, 
                   boxmean=True, marker_color='red' if cluster_id == selected_cluster else None),
            row=1, col=2
        )
    
    fig.update_layout(height=400, showlegend=True)
    st.plotly_chart(fig, use_container_width=True)
    
    # Dynamic Insights
    st.subheader("💡 Dynamic Business Insights")
    
    col1, col2 = st.columns(2)
    
    with col1:
        st.markdown("**Category Strategy:**")
        
        if market_share > 0.20:
            strategy = "MARKET LEADER"
            recommendation = "Focus on differentiation and premium sub-segments"
        elif market_share > 0.12:
            strategy = "GROWTH OPPORTUNITY" 
            recommendation = "Expand content library and increase user awareness"
        else:
            strategy = "NICHE EXCELLENCE"
            recommendation = "Perfect the experience for dedicated users"
        
        st.success(f"**{strategy}**")
        st.write(recommendation)
        
        # Skip risk assessment (popularity is used as the proxy); an empty
        # cluster has no popularity to assess, so report N/A rather than
        # letting NaN fall through to the "HIGH" branch.
        if cluster_size == 0:
            st.markdown("**Skip Risk**: N/A")
        else:
            avg_popularity = cluster_profile['avg_popularity']
            if avg_popularity > 60:
                skip_risk = "LOW"
                risk_color = "green"
            elif avg_popularity > 40:
                skip_risk = "MEDIUM"
                risk_color = "orange"
            else:
                skip_risk = "HIGH"
                risk_color = "red"
                
            st.markdown(f"**Skip Risk**: :{risk_color}[{skip_risk}]")
    
    with col2:
        st.markdown("**Sample Popular Tracks:**")
        for i, (_, track) in enumerate(cluster_profile['sample_tracks'].head(3).iterrows(), 1):
            st.write(f"{i}. **{track['track_name']}** - {track['track_artist']} (Pop: {track['track_popularity']})")
        
        # Context recommendations
        st.markdown("**Best Use Cases:**")
        use_cases = {
            0: ["Background listening", "Casual playlists"],
            1: ["Coffee shops", "Study sessions", "Relaxation"],
            2: ["Parties", "Clubs", "High-intensity workouts"],
            3: ["Evening listening", "Emotional moments"],
            4: ["Gym workouts", "Running", "Motivation"],
            5: ["Work", "Focus sessions", "Ambient background"]
        }
        
        for use_case in use_cases.get(selected_cluster, ["General listening"]):
            st.write(f"• {use_case}")
    
    # Summary recommendations
    st.subheader("🎯 Actionable Recommendations")
    
    recommendations = [
        "**Algorithm Enhancement**: Use cluster boundaries for better song transitions",
        "**Playlist Curation**: Create context-specific playlists based on cluster profiles",
        "**User Interface**: Implement audio feature sliders for personalized discovery",
        "**Skip Prediction**: Monitor cross-cluster jumps to predict skip likelihood",
        "**Revenue Optimization**: Target B2B licensing for specific cluster use cases"
    ]
    
    for rec in recommendations:
        st.write(f"• {rec}")
    
    # Footer
    st.markdown("---")
    st.markdown("""
    **Key Insight**: This analysis reveals that audio features, not genres, determine playlist compatibility. 
    By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement 
    through data-driven curation.
    """)

# Script entry point: build the dashboard when this file is executed as the
# main module.
if __name__ == "__main__":
    main()