Peter512 committed on
Commit
9d1f061
·
verified ·
1 Parent(s): cd029ad

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +424 -0
app.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from plotly.subplots import make_subplots
9
+ from sklearn.preprocessing import StandardScaler
10
+ from sklearn.decomposition import PCA
11
+ from sklearn.cluster import KMeans
12
+ from sklearn.metrics import silhouette_score
13
+ import warnings
14
+ warnings.filterwarnings('ignore')
15
+
16
# Page configuration. Kept ahead of every other Streamlit call in the script.
st.set_page_config(
    page_title="Spotify Playlist Optimizer",
    page_icon="🎵",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Stylesheet injected once at start-up: top padding for the main pane, a
# smaller metric font, and the green gradient banner used for the
# "cluster-header" element rendered in main().
_CUSTOM_CSS = """
<style>
    .main > div {
        padding-top: 2rem;
    }
    .stMetric > div > div > div > div {
        font-size: 1rem;
    }
    .cluster-header {
        background: linear-gradient(90deg, #1DB954, #1ed760);
        color: white;
        padding: 10px;
        border-radius: 5px;
        text-align: center;
        margin-bottom: 20px;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
43
+
44
@st.cache_data
def load_and_process_data():
    """Download the Spotify songs CSV, clean it and cluster the tracks.

    Pipeline: de-duplicate on (track name, artist), drop rows outside the
    duration/tempo/popularity bounds, drop rows with missing audio features,
    standardise the features, run a full PCA and fit a 6-cluster KMeans on
    the first five principal components.

    Returns:
        tuple: ``(df_final, pca, scaler, audio_features, cluster_names)``
        where ``df_final`` is the cleaned frame with added ``Cluster``,
        ``PC1``-``PC3`` and ``Cluster_Name`` columns, ``pca`` and ``scaler``
        are the fitted transformers, ``audio_features`` is the list of
        feature columns used, and ``cluster_names`` maps cluster id to a
        display label.
    """
    source_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
    raw = pd.read_csv(source_url)

    # Columns fed to the scaler / PCA / clustering steps.
    audio_features = [
        'danceability', 'energy', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'loudness', 'key', 'mode'
    ]

    # One row per (track name, artist) pair; the first occurrence wins.
    tracks = raw.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

    # Strict bounds: 30 s < duration < 10 min, 50 < tempo < 200, popularity > 0.
    duration = tracks['duration_ms']
    tempo = tracks['tempo']
    keep_mask = (
        duration.gt(30000)
        & duration.lt(600000)
        & tempo.gt(50)
        & tempo.lt(200)
        & tracks['track_popularity'].gt(0)
    )
    tracks = tracks[keep_mask].dropna(subset=audio_features)

    # Standardise, then project onto all principal components.
    scaler = StandardScaler()
    pca = PCA()
    pca_results = pca.fit_transform(scaler.fit_transform(tracks[audio_features]))

    # Cluster on the leading components only.
    n_components = 5
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(pca_results[:, :n_components])

    # NOTE(review): labels are tied to the KMeans cluster indices by position;
    # presumably chosen by inspecting this exact fit - confirm if the data or
    # the clustering parameters change.
    cluster_names = {
        0: "Energetic Mainstream",
        1: "Acoustic Chill",
        2: "High-Energy Party",
        3: "Moody & Introspective",
        4: "Workout & Motivation",
        5: "Focus & Background"
    }

    df_final = tracks.copy()
    df_final['Cluster'] = clusters
    for position, column in enumerate(('PC1', 'PC2', 'PC3')):
        df_final[column] = pca_results[:, position]
    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

    return df_final, pca, scaler, audio_features, cluster_names
107
+
108
def create_cluster_profile(df, cluster_id, audio_features):
    """Summarise how one cluster differs from the whole frame.

    Args:
        df: DataFrame containing a ``Cluster`` column, every column named in
            ``audio_features``, and the ``track_popularity``,
            ``playlist_genre``, ``track_name`` and ``track_artist`` columns.
        cluster_id: Value of ``Cluster`` selecting the rows to profile.
        audio_features: Names of the numeric columns to compare.

    Returns:
        dict with keys:
            ``size``: number of rows in the cluster.
            ``avg_popularity``: mean ``track_popularity`` (NaN if empty).
            ``top_genres``: value counts of the three most common genres.
            ``differences``: list of ``{'feature', 'value', 'diff_pct'}``
                dicts for features whose cluster mean deviates from the
                overall mean by more than 10 %, largest deviation first.
            ``sample_tracks``: the five most popular tracks (name, artist,
                popularity).
    """
    cluster_data = df[df['Cluster'] == cluster_id]
    overall_stats = df[audio_features].mean()
    cluster_stats = cluster_data[audio_features].mean()

    differences = []
    for feature in audio_features:
        baseline = overall_stats[feature]
        value = cluster_stats[feature]

        # A relative change against a zero baseline is undefined; skip it
        # rather than report an infinite percentage.
        if baseline == 0:
            continue

        # Divide by the magnitude of the baseline so the sign of diff_pct
        # follows the direction of the change even when the overall mean is
        # negative (e.g. loudness in dB).
        diff_pct = (value - baseline) / abs(baseline) * 100

        # Only significant differences; a NaN diff (empty cluster) fails this
        # comparison and is dropped as well.
        if abs(diff_pct) > 10:
            differences.append({
                'feature': feature.replace('_', ' ').title(),
                'value': value,
                'diff_pct': diff_pct
            })

    differences.sort(key=lambda entry: abs(entry['diff_pct']), reverse=True)

    return {
        'size': len(cluster_data),
        'avg_popularity': cluster_data['track_popularity'].mean(),
        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
        'differences': differences,
        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[['track_name', 'track_artist', 'track_popularity']]
    }
134
+
135
def main():
    """Render the Spotify Playlist Optimizer dashboard.

    Loads the clustered track data, draws the sidebar controls (category,
    focus feature, minimum popularity, genres), filters the tracks
    accordingly and renders the scatter plot, cluster metrics, radar chart,
    feature distributions, insights and recommendations.

    If the filters leave no tracks at all, or no tracks in the selected
    category, a warning is shown and rendering stops early. This avoids
    dividing by an empty selection and showing NaN-based metrics.
    """
    # The fitted PCA and scaler are returned by the loader but not needed here.
    df, _pca, _scaler, audio_features, cluster_names = load_and_process_data()

    # Header
    st.title("🎵 Spotify Playlist Optimizer")
    st.markdown("### Data-Driven Solutions for Music Engagement")

    # Business problem statement
    with st.expander("📊 Business Problem & Solution", expanded=True):
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("""
            **The Challenge:**
            - 67% of playlist tracks get skipped within 30 seconds
            - Traditional genre-based grouping fails in real contexts
            - Poor playlist flow leads to user disengagement
            - Lost revenue from subscription churn
            """)

        with col2:
            st.markdown("""
            **Our Solution:**
            - Audio feature-based clustering identifies 6 playlist types
            - Data-driven curation reduces skip rates
            - Context-aware recommendations improve engagement
            - Actionable insights for streaming platforms
            """)

    # Sidebar controls
    st.sidebar.header("🎛️ Explore Clusters")

    # Control 1: Cluster Selection
    selected_cluster = st.sidebar.selectbox(
        "Select Playlist Category:",
        options=list(cluster_names.keys()),
        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
        index=2  # Default to High-Energy Party
    )

    # Control 2: Audio Feature Focus
    focus_feature = st.sidebar.selectbox(
        "Focus Audio Feature:",
        options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'],
        index=0
    )

    # Control 3: Popularity Filter
    min_popularity = st.sidebar.slider(
        "Minimum Track Popularity:",
        min_value=0,
        max_value=100,
        value=20,
        step=10
    )

    # Control 4: Genre Filter
    available_genres = df['playlist_genre'].unique()
    selected_genres = st.sidebar.multiselect(
        "Filter by Genres:",
        options=available_genres,
        default=available_genres
    )

    # Filter data based on controls
    filtered_df = df[
        (df['track_popularity'] >= min_popularity) &
        (df['playlist_genre'].isin(selected_genres))
    ]

    # Guard: with nothing left after filtering, the market-share division
    # below would raise ZeroDivisionError.
    if filtered_df.empty:
        st.warning(
            "No tracks match the current filters. "
            "Lower the minimum popularity or select more genres."
        )
        return

    # Guard: an empty selected category would otherwise show NaN metrics and
    # fall through to a "HIGH" skip risk.
    selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]
    if selected_cluster_data.empty:
        st.warning(
            f"No tracks in '{cluster_names[selected_cluster]}' match the current filters. "
            "Adjust the filters or pick another category."
        )
        return

    # Profile and market share are reused by several sections below.
    cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
    market_share = cluster_profile['size'] / len(filtered_df)

    # Main content area
    col1, col2 = st.columns([2, 1])

    with col1:
        # Visualization 1: Cluster scatter plot
        st.subheader("🎯 Playlist Categories in Audio Space")

        fig = px.scatter(
            filtered_df,
            x='PC1',
            y='PC2',
            color='Cluster_Name',
            size=focus_feature,
            hover_data=['track_name', 'track_artist', 'track_popularity'],
            title=f"Playlist Categories (sized by {focus_feature.title()})",
            width=700,
            height=500
        )

        # Highlight selected cluster
        fig.add_scatter(
            x=selected_cluster_data['PC1'],
            y=selected_cluster_data['PC2'],
            mode='markers',
            marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)),
            name=f'Selected: {cluster_names[selected_cluster]}',
            showlegend=True
        )

        fig.update_layout(
            xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
            yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)"
        )

        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Key metrics for selected cluster
        st.markdown(f"""
        <div class="cluster-header">
            <h3>{cluster_names[selected_cluster]}</h3>
        </div>
        """, unsafe_allow_html=True)

        st.metric("Tracks in Category", f"{cluster_profile['size']:,}")
        st.metric("Avg Popularity", f"{cluster_profile['avg_popularity']:.1f}/100")
        st.metric("Market Share", f"{market_share*100:.1f}%")

    # Visualization 2: Audio feature radar chart
    st.subheader("📊 Audio DNA Profile")

    col1, col2 = st.columns(2)

    with col1:
        # Radar chart for selected cluster
        radar_features = ['danceability', 'energy', 'valence', 'acousticness', 'speechiness', 'liveness']
        radar_labels = [f.title() for f in radar_features]

        cluster_means = selected_cluster_data[radar_features].mean()
        overall_means = filtered_df[radar_features].mean()

        fig = go.Figure()

        fig.add_trace(go.Scatterpolar(
            r=cluster_means.values,
            theta=radar_labels,
            fill='toself',
            name=cluster_names[selected_cluster],
            line_color='#1DB954'
        ))

        fig.add_trace(go.Scatterpolar(
            r=overall_means.values,
            theta=radar_labels,
            fill='toself',
            name='Overall Average',
            line_color='gray',
            opacity=0.5
        ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )),
            showlegend=True,
            title="Cluster vs Overall Average"
        )

        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Distinctive characteristics
        st.write("**Key Characteristics:**")
        for diff in cluster_profile['differences'][:5]:
            direction = "📈" if diff['diff_pct'] > 0 else "📉"
            st.write(f"{direction} **{diff['feature']}**: {diff['value']:.3f} ({diff['diff_pct']:+.1f}%)")

        st.write("**Top Genres:**")
        for genre, count in cluster_profile['top_genres'].items():
            percentage = (count / cluster_profile['size']) * 100
            st.write(f"• {genre}: {percentage:.1f}%")

    # Visualization 3: Feature distribution comparison
    st.subheader("🎵 Feature Deep Dive")

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison')
    )

    # Distribution plot
    cluster_focus = selected_cluster_data[focus_feature]
    other_focus = filtered_df[filtered_df['Cluster'] != selected_cluster][focus_feature]

    fig.add_trace(
        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster], opacity=0.7, nbinsx=30),
        row=1, col=1
    )
    fig.add_trace(
        go.Histogram(x=other_focus, name='Other Clusters', opacity=0.5, nbinsx=30),
        row=1, col=1
    )

    # Box plot comparison; the selected category is drawn in red.
    for cluster_id, cluster_label in cluster_names.items():
        cluster_data = filtered_df[filtered_df['Cluster'] == cluster_id]
        fig.add_trace(
            go.Box(y=cluster_data[focus_feature], name=cluster_label,
                   boxmean=True, marker_color='red' if cluster_id == selected_cluster else None),
            row=1, col=2
        )

    fig.update_layout(height=400, showlegend=True)
    st.plotly_chart(fig, use_container_width=True)

    # Dynamic Insights
    st.subheader("💡 Dynamic Business Insights")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**Category Strategy:**")

        if market_share > 0.20:
            strategy = "MARKET LEADER"
            recommendation = "Focus on differentiation and premium sub-segments"
        elif market_share > 0.12:
            strategy = "GROWTH OPPORTUNITY"
            recommendation = "Expand content library and increase user awareness"
        else:
            strategy = "NICHE EXCELLENCE"
            recommendation = "Perfect the experience for dedicated users"

        st.success(f"**{strategy}**")
        st.write(recommendation)

        # Skip risk assessment, bucketed by the category's average popularity.
        avg_popularity = cluster_profile['avg_popularity']
        if avg_popularity > 60:
            skip_risk = "LOW"
            risk_color = "green"
        elif avg_popularity > 40:
            skip_risk = "MEDIUM"
            risk_color = "orange"
        else:
            skip_risk = "HIGH"
            risk_color = "red"

        st.markdown(f"**Skip Risk**: :{risk_color}[{skip_risk}]")

    with col2:
        st.markdown("**Sample Popular Tracks:**")
        for i, (_, track) in enumerate(cluster_profile['sample_tracks'].head(3).iterrows(), 1):
            st.write(f"{i}. **{track['track_name']}** - {track['track_artist']} (Pop: {track['track_popularity']})")

        # Context recommendations
        st.markdown("**Best Use Cases:**")
        use_cases = {
            0: ["Background listening", "Casual playlists"],
            1: ["Coffee shops", "Study sessions", "Relaxation"],
            2: ["Parties", "Clubs", "High-intensity workouts"],
            3: ["Evening listening", "Emotional moments"],
            4: ["Gym workouts", "Running", "Motivation"],
            5: ["Work", "Focus sessions", "Ambient background"]
        }

        for use_case in use_cases.get(selected_cluster, ["General listening"]):
            st.write(f"• {use_case}")

    # Summary recommendations
    st.subheader("🎯 Actionable Recommendations")

    recommendations = [
        "**Algorithm Enhancement**: Use cluster boundaries for better song transitions",
        "**Playlist Curation**: Create context-specific playlists based on cluster profiles",
        "**User Interface**: Implement audio feature sliders for personalized discovery",
        "**Skip Prediction**: Monitor cross-cluster jumps to predict skip likelihood",
        "**Revenue Optimization**: Target B2B licensing for specific cluster use cases"
    ]

    for rec in recommendations:
        st.write(f"• {rec}")

    # Footer
    st.markdown("---")
    st.markdown("""
    **Key Insight**: This analysis reveals that audio features, not genres, determine playlist compatibility.
    By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement
    through data-driven curation.
    """)
422
+
423
# Script entry point: build the dashboard only when this file is executed as
# the main module, not when it is imported.
if __name__ == "__main__":
    main()