Ezhil committed on
Commit
a31e167
·
1 Parent(s): b595166

pushing - 1

Browse files
functions/__pycache__/visualizations.cpython-310.pyc CHANGED
Binary files a/functions/__pycache__/visualizations.cpython-310.pyc and b/functions/__pycache__/visualizations.cpython-310.pyc differ
 
functions/visualizations.py CHANGED
@@ -5,20 +5,23 @@ import seaborn as sns
5
  import matplotlib.pyplot as plt
6
  import networkx as nx
7
  import plotly.graph_objects as go
8
- from itertools import chain,combinations
9
  import numpy as np
10
  from collections import Counter
11
 
12
 
13
  def generate_popularity_trends(df):
14
  st.header("Popularity Trends Over Time")
15
- tab1, tab2, tab3 = st.tabs(["Average Popularity", "Individual Songs", "Top 10 Songs"])
16
-
 
17
  with tab1:
18
- st.markdown("**Average Popularity by Decade:** This chart shows how the average popularity of songs has changed over different decades.")
 
19
  if 'Decade' in df.columns:
20
- top_decades = df.groupby('Decade')['Popularity'].mean().reset_index().nlargest(10, 'Popularity')
21
-
 
22
  fig1 = go.Figure()
23
  fig1.add_trace(go.Scatter(
24
  x=top_decades['Decade'],
@@ -26,7 +29,8 @@ def generate_popularity_trends(df):
26
  mode='lines+markers',
27
  fill='tonexty',
28
  line=dict(color='royalblue', width=3),
29
- marker=dict(size=8, color='darkblue', line=dict(width=2, color='white')),
 
30
  name='Popularity',
31
  hovertext=top_decades['Decade']
32
  ))
@@ -41,9 +45,10 @@ def generate_popularity_trends(df):
41
  st.plotly_chart(fig1)
42
  else:
43
  st.error("Cannot plot: 'Decade' column missing.")
44
-
45
  with tab2:
46
- st.markdown("**Top 10 Individual Songs:** This scatter plot highlights the popularity of the top 10 most popular songs over time.")
 
47
  if 'Year' in df.columns:
48
  top_songs = df.nlargest(10, 'Popularity')
49
  fig2 = px.scatter(
@@ -64,17 +69,20 @@ def generate_popularity_trends(df):
64
  st.plotly_chart(fig2)
65
  else:
66
  st.error("Cannot plot: 'Year' column missing.")
67
-
68
  with tab3:
69
- st.markdown("**Top 10 Most Popular Songs:** This bar chart displays the top 10 songs based on their popularity scores.")
 
70
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
71
- top_songs = df.nlargest(10, 'Popularity')[['Track Name', 'Artist Name(s)', 'Popularity']]
 
72
  fig3 = px.bar(
73
  top_songs, y='Track Name', x='Popularity',
74
  orientation='h', color='Popularity',
75
  color_continuous_scale='deep',
76
  title='Top 10 Most Popular Songs',
77
- labels={'Track Name': 'Song Title', 'Popularity': 'Popularity Score'},
 
78
  hover_data=['Track Name', 'Artist Name(s)']
79
  )
80
  fig3.update_layout(
@@ -89,21 +97,22 @@ def generate_popularity_trends(df):
89
  st.error("Cannot plot: 'Track Name' or 'Popularity' column missing.")
90
 
91
 
92
- def generate_audio_features(df):
93
  st.header("Audio Features Analysis")
94
 
95
  feature = st.selectbox(
96
  "Select Feature", ['Danceability', 'Energy', 'Tempo', 'Loudness']
97
  )
98
-
99
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
100
-
101
  with tab1:
102
- st.markdown(f"**Top 20 {feature} Values:** This bar chart displays the distribution of the top 20 songs based on {feature}.")
 
103
  top_features = df.nlargest(20, feature)
104
 
105
  fig = px.bar(
106
- top_features, x='Track Name', y=feature,
107
  color='Decade' if 'Decade' in df.columns else None,
108
  title=f'Top 20 Songs by {feature}',
109
  color_discrete_sequence=px.colors.qualitative.Set2,
@@ -111,12 +120,14 @@ def generate_audio_features(df):
111
  )
112
  fig.update_layout(xaxis_tickangle=-45, template='plotly_white')
113
  st.plotly_chart(fig)
114
-
115
  with tab2:
116
- st.markdown(f"**{feature} by Decade:** This line chart compares the top {feature} trends over different decades.")
 
117
 
118
  if 'Decade' in df.columns:
119
- avg_feature_by_decade = df.groupby('Decade')[feature].mean().reset_index()
 
120
 
121
  fig2 = px.line(
122
  avg_feature_by_decade, x='Decade', y=feature,
@@ -130,14 +141,18 @@ def generate_audio_features(df):
130
  else:
131
  st.error("Cannot plot: 'Decade' column missing.")
132
 
 
133
  def generate_genre_analysis(df):
134
  st.header("Genre & Artist Analysis")
135
- tab1, tab2, tab3 = st.tabs(["Top Genres", "Genre Distribution", "Artist Popularity"])
136
-
 
137
  with tab1:
138
- st.markdown("**Top Genres in Top 10 Songs:** Displays the most common genres among the top 10 most popular songs.")
 
139
  top_songs = df.nlargest(10, 'Popularity')
140
- top_genres = top_songs.explode('Genres')['Genres'].value_counts().reset_index()
 
141
  fig1 = px.bar(
142
  top_genres, x='count', y='Genres',
143
  orientation='h', color='count',
@@ -148,45 +163,55 @@ def generate_genre_analysis(df):
148
  )
149
  fig1.update_layout(template='plotly_white', width=900, height=500)
150
  st.plotly_chart(fig1)
151
-
152
  with tab2:
153
- st.markdown("**Genre Distribution in Top 10 Songs:** Shows how different genres contribute to the top 10 songs.")
 
154
  genre_song_data = top_songs.explode('Genres')
155
  fig2 = px.bar(
156
  genre_song_data, x='Track Name', y='Popularity', color='Genres',
157
  title='Genre Distribution in Top 10 Songs',
158
- labels={'Track Name': 'Song Title', 'Popularity': 'Popularity Score', 'Genres': 'Genre'},
 
159
  barmode='stack',
160
  hover_data=['Track Name', 'Genres']
161
  )
162
  fig2.update_layout(template='plotly_white', width=900, height=500)
163
  st.plotly_chart(fig2)
164
-
165
  with tab3:
166
- st.markdown("**Artist Popularity in Top 10 Songs:** Visualizes the most popular artists in the top 10 songs with their song count and names.")
167
- artist_popularity = top_songs.groupby('Artist Name(s)').agg({'Popularity': 'sum', 'Track Name': lambda x: list(x)}).reset_index().sort_values(by='Popularity', ascending=False)
168
- artist_popularity['Song Count'] = artist_popularity['Track Name'].apply(len)
 
 
 
169
  fig3 = px.bar(
170
  artist_popularity, x='Popularity', y='Artist Name(s)',
171
  orientation='h', color='Popularity',
172
  color_continuous_scale='blues',
173
  title='Artist Popularity in Top 10 Songs',
174
- labels={'Artist Name(s)': 'Artist Name', 'Popularity': 'Total Popularity Score', 'Song Count': 'Number of Songs'},
175
- hover_data={'Artist Name(s)': True, 'Popularity': True, 'Song Count': True, 'Track Name': True}
 
 
176
  )
177
  fig3.update_layout(template='plotly_white', width=900, height=500)
178
  st.plotly_chart(fig3)
179
 
 
180
  def generate_explicit_trends(df):
181
  st.header("Explicit Content Trends")
182
  st.markdown("**Explicit vs Non-Explicit Songs Over Time:** This line chart shows how the number of explicit and non-explicit songs has changed over different decades.")
183
  if 'Decade' in df.columns and 'Explicit' in df.columns:
184
- explicit_trends = df.groupby(['Decade', 'Explicit']).size().reset_index(name='Count')
 
185
  fig = px.line(
186
  explicit_trends, x='Decade', y='Count', color='Explicit',
187
  markers=True, line_shape='linear',
188
  title='Explicit vs Non-Explicit Songs Over Time',
189
- labels={'Decade': 'Decade', 'Count': 'Number of Songs', 'Explicit': 'Song Type'},
 
190
  color_discrete_map={True: 'purple', False: 'green'}
191
  )
192
  fig.update_layout(template='plotly_white', width=900, height=500)
@@ -194,12 +219,14 @@ def generate_explicit_trends(df):
194
  else:
195
  st.error("Cannot plot: 'Decade' or 'Explicit' column missing.")
196
 
 
197
  def generate_album_insights(df):
198
  st.header("Album & Label Insights")
199
  tab1, tab2 = st.tabs(["Top Labels", "Album Popularity"])
200
-
201
  with tab1:
202
- st.markdown("**Top Record Labels:** Displays the most dominant record labels based on the number of songs they have released.")
 
203
  if 'Label' in df.columns:
204
  top_labels = df['Label'].value_counts().nlargest(10).reset_index()
205
  fig9 = px.sunburst(
@@ -212,17 +239,21 @@ def generate_album_insights(df):
212
  st.plotly_chart(fig9)
213
  else:
214
  st.error("Cannot plot: 'Label' column missing.")
215
-
216
  with tab2:
217
- st.markdown("**Album Popularity:** Compares the popularity of albums based on the number of songs and their average popularity score.")
 
218
  if 'Album Name' in df.columns and 'Popularity' in df.columns:
219
- album_pop = df.groupby('Album Name')['Popularity'].agg(['mean', 'count']).reset_index()
220
- album_pop = album_pop.sort_values(by=['mean', 'count'], ascending=[False, False]).nlargest(10, 'mean')
 
 
221
  fig10 = px.strip(
222
  album_pop, x='mean', y='Album Name',
223
  color='count',
224
  title='Top 10 Albums by Popularity',
225
- labels={'Album Name': 'Album', 'mean': 'Average Popularity Score', 'count': 'Number of Songs'},
 
226
  hover_data={'Album Name': True, 'count': True, 'mean': True},
227
  color_discrete_sequence=px.colors.qualitative.Pastel
228
  )
@@ -239,16 +270,19 @@ def generate_tempo_mood(df):
239
  st.markdown("**Tempo Trends:** Tracks tempo changes.")
240
  if 'Year' in df.columns and 'Tempo' in df.columns:
241
  tempo_by_year = df.groupby('Year')['Tempo'].mean().reset_index()
242
- fig11 = px.line(tempo_by_year, x='Year', y='Tempo', title='Average Tempo Over Time', color_discrete_sequence=['orange'])
 
243
  fig11.update_layout(template='plotly_white', width=800, height=400)
244
  st.plotly_chart(fig11)
245
  else:
246
  st.error("Cannot plot: 'Year' or 'Tempo' column missing.")
247
  with tab2:
248
- st.markdown("**Mood Analysis (Valence & Energy):** Categorizes songs based on mood and energy.")
 
249
  if 'Valence' in df.columns and 'Energy' in df.columns:
250
  top_songs = df.nlargest(10, 'Popularity')
251
- mood_by_valence = top_songs.groupby('Valence')['Energy'].mean().reset_index()
 
252
  fig12 = px.bar(
253
  mood_by_valence, x='Valence', y='Energy',
254
  title='Average Energy Levels by Valence (Mood Analysis)',
@@ -258,14 +292,17 @@ def generate_tempo_mood(df):
258
  st.plotly_chart(fig12)
259
  else:
260
  st.error("Cannot plot: 'Valence' or 'Energy' column missing.")
 
 
261
  def generate_top_artists_songs(df):
262
  st.header("Top Artists and Songs")
263
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
264
-
265
  with tab1:
266
  st.markdown("**Most Featured Artists:** Shows top artists.")
267
  if 'Artist Name(s)' in df.columns:
268
- top_artists = df['Artist Name(s)'].value_counts().nlargest(10).reset_index()
 
269
  fig13 = px.bar(
270
  top_artists, x='count', y='Artist Name(s)',
271
  orientation='h',
@@ -276,11 +313,12 @@ def generate_top_artists_songs(df):
276
  st.plotly_chart(fig13)
277
  else:
278
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
279
-
280
  with tab2:
281
  st.markdown("**Top 10 Songs:** Lists top songs.")
282
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
283
- top_songs = df.nlargest(10, 'Popularity')[['Track Name', 'Popularity']]
 
284
  fig14 = px.pie(
285
  top_songs, values='Popularity', names='Track Name',
286
  title='Top 10 Songs by Popularity', color_discrete_sequence=px.colors.qualitative.Set3
@@ -297,8 +335,10 @@ def generate_album_release_trends(df):
297
  with tab1:
298
  st.markdown("**Albums per Year:** Tracks release patterns.")
299
  if 'Year' in df.columns:
300
- albums_per_year = df['Year'].value_counts().sort_index().reset_index()
301
- fig15 = px.line(albums_per_year, x='Year', y='count', title='Number of Albums Released per Year', color_discrete_sequence=['purple'])
 
 
302
  fig15.update_layout(template='plotly_white', width=800, height=400)
303
  st.plotly_chart(fig15)
304
  else:
@@ -307,11 +347,13 @@ def generate_album_release_trends(df):
307
  st.markdown("**Songs by Artists and Years:** Visualizes trends.")
308
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
309
  # Filter to only show the top 10 most featured artists
310
- top_artists = df['Artist Name(s)'].value_counts().nlargest(10).index
 
311
  filtered_df = df[df['Artist Name(s)'].isin(top_artists)]
312
-
313
  # Grouping data
314
- artist_year = filtered_df.groupby(['Year', 'Artist Name(s)']).size().reset_index(name='Count')
 
315
 
316
  # Create a grouped bar chart
317
  fig16 = px.bar(
@@ -325,6 +367,8 @@ def generate_album_release_trends(df):
325
  st.plotly_chart(fig16)
326
  else:
327
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
 
 
328
  def generate_duration_analysis(df):
329
  st.header("Track Duration Analysis")
330
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
@@ -333,7 +377,8 @@ def generate_duration_analysis(df):
333
  df = df[df['Track Duration (ms)'] <= 900000]
334
 
335
  with tab1:
336
- st.markdown("**Track Duration Distribution:** Illustrates how track durations vary, helping identify common song lengths.")
 
337
  if 'Track Duration (ms)' in df.columns:
338
  fig17 = px.histogram(
339
  df, x='Track Duration (ms)',
@@ -345,12 +390,14 @@ def generate_duration_analysis(df):
345
  st.plotly_chart(fig17)
346
  else:
347
  st.error("Cannot plot: 'Track Duration (ms)' column missing.")
348
-
349
  with tab2:
350
- st.markdown("**Duration by Decade:** Compares the evolution of average track durations across decades, showing historical trends.")
 
351
  if 'Decade' in df.columns and 'Track Duration (ms)' in df.columns:
352
  fig18 = px.pie(
353
- df.groupby('Decade')['Track Duration (ms)'].mean().reset_index(),
 
354
  names='Decade', values='Track Duration (ms)',
355
  title='Average Track Duration by Decade',
356
  color_discrete_sequence=px.colors.qualitative.Set2
@@ -358,53 +405,63 @@ def generate_duration_analysis(df):
358
  fig18.update_layout(template='plotly_white', width=800, height=400)
359
  st.plotly_chart(fig18)
360
  else:
361
- st.error("Cannot plot: 'Decade' or 'Track Duration (ms)' column missing.")
362
-
363
 
364
 
365
  def generate_streaming_insights(df):
366
  st.header("Streaming and Engagement Insights")
367
  tab1, tab2 = st.tabs(["Popularity vs Duration", "Time Signature"])
368
-
369
  with tab1:
370
- st.markdown("**Popularity vs Track Duration:** This line chart shows the trend of song popularity based on their duration.")
371
-
 
372
  if 'Track Duration (ms)' in df.columns and 'Popularity' in df.columns:
373
  df['Duration (minutes)'] = df['Track Duration (ms)'] / 60000
374
- duration_bins = pd.cut(df['Duration (minutes)'], bins=[0, 2, 4, 6, 8, 10, 15], labels=['0-2', '2-4', '4-6', '6-8', '8-10', '10+'])
375
- avg_popularity = df.groupby(duration_bins)['Popularity'].mean().reset_index()
 
 
376
 
377
  fig1 = px.line(
378
- avg_popularity,
379
- x='Duration (minutes)',
380
  y='Popularity',
381
  title='Popularity vs. Track Duration',
382
  markers=True, # Adds points to the line
383
  line_shape='spline', # Smoothens the line
384
  color_discrete_sequence=['blue']
385
  )
386
- fig1.update_layout(template='plotly_white', xaxis_title='Track Duration (Minutes)', yaxis_title='Average Popularity')
 
387
  st.plotly_chart(fig1)
388
  else:
389
- st.error("Cannot plot: 'Track Duration (ms)' or 'Popularity' column missing.")
 
390
 
391
  with tab2:
392
- st.markdown("**Popularity by Time Signature:** This bar chart compares the average popularity of songs based on their time signatures.")
 
393
 
394
  if 'Time Signature' in df.columns and 'Popularity' in df.columns:
395
- pop_by_time = df.groupby('Time Signature')['Popularity'].mean().reset_index()
 
396
  fig2 = px.bar(
397
- pop_by_time,
398
- x='Time Signature',
399
  y='Popularity',
400
  title='Average Popularity by Time Signature',
401
  color='Popularity',
402
  color_continuous_scale='purples'
403
  )
404
- fig2.update_layout(template='plotly_white', xaxis_title='Time Signature', yaxis_title='Average Popularity')
 
405
  st.plotly_chart(fig2)
406
  else:
407
- st.error("Cannot plot: 'Time Signature' or 'Popularity' column missing.")
 
 
408
 
409
  def generate_feature_comparisons(df):
410
  st.header("Feature Comparisons Across Decades")
@@ -412,7 +469,8 @@ def generate_feature_comparisons(df):
412
  with tab1:
413
  st.markdown("**Feature Comparison:** Compares features across decades.")
414
  if 'Decade' in df.columns:
415
- features_by_decade = df.groupby('Decade')[['Danceability', 'Energy', 'Valence']].mean().reset_index()
 
416
  fig21 = px.bar(features_by_decade.melt(id_vars='Decade'), x='Decade', y='value', color='variable',
417
  barmode='group', title='Feature Comparison by Decade', color_discrete_sequence=px.colors.qualitative.Pastel)
418
  fig21.update_layout(template='plotly_white', width=800, height=400)
@@ -422,21 +480,26 @@ def generate_feature_comparisons(df):
422
  with tab2:
423
  st.markdown("**Loudness Over Time:** Tracks loudness trends.")
424
  if 'Year' in df.columns and 'Loudness' in df.columns:
425
- loudness_by_year = df.groupby('Year')['Loudness'].mean().reset_index()
426
- fig22 = px.line(loudness_by_year, x='Year', y='Loudness', title='Average Loudness Over Time', color_discrete_sequence=['green'])
 
 
427
  fig22.update_layout(template='plotly_white', width=800, height=400)
428
  st.plotly_chart(fig22)
429
  else:
430
  st.error("Cannot plot: 'Year' or 'Loudness' column missing.")
431
 
 
432
  def generate_top_artists_songs(df):
433
  st.header("Top Artists and Songs")
434
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
435
-
436
  with tab1:
437
- st.markdown("**Most Featured Artists:** Displays the top 10 artists with the highest song counts, highlighting their dominance in the dataset.")
 
438
  if 'Artist Name(s)' in df.columns:
439
- top_artists = df['Artist Name(s)'].value_counts().nlargest(10).reset_index()
 
440
  top_artists.columns = ['Artist Name(s)', 'Count']
441
  fig13 = px.sunburst(
442
  top_artists, path=['Artist Name(s)'], values='Count',
@@ -448,11 +511,13 @@ def generate_top_artists_songs(df):
448
  st.plotly_chart(fig13)
449
  else:
450
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
451
-
452
  with tab2:
453
- st.markdown("**Songs by Artists and Years:** Analyzes song release trends across different years, focusing on the top artists.")
 
454
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
455
- artist_year = df.groupby(['Artist Name(s)', 'Year']).size().reset_index(name='Count')
 
456
  fig16 = px.sunburst(
457
  artist_year, path=['Year', 'Artist Name(s)'], values='Count',
458
  title='Songs Released by Artists Over the Years',
@@ -464,60 +529,74 @@ def generate_top_artists_songs(df):
464
  else:
465
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
466
 
 
467
  def generate_network_analysis(df):
468
  st.header("Network Analysis")
469
  tab1, tab2 = st.tabs(["Artist Collaborations", "Genre Crossover"])
470
-
471
  # Ensure column names are stripped of spaces
472
  df.columns = df.columns.str.strip()
473
-
474
  with tab1:
475
- st.markdown("**Top Collaborating Artists:** This chart highlights artists who frequently collaborate with each other.")
 
476
  if 'Artist Name(s)' in df.columns:
477
- df['Artist Name(s)'] = df['Artist Name(s)'].astype(str).str.split(', ')
 
478
  collaborations = []
479
  for artists in df['Artist Name(s)']:
480
  collaborations.extend(combinations(sorted(artists), 2))
481
-
482
  collab_counts = Counter(collaborations)
483
- top_collabs = sorted(collab_counts.items(), key=lambda x: x[1], reverse=True)[:20]
484
-
 
485
  G = nx.Graph()
486
  for (artist1, artist2), weight in top_collabs:
487
  G.add_edge(artist1, artist2, weight=weight)
488
-
489
  pos = nx.spring_layout(G, seed=42)
490
  plt.figure(figsize=(12, 8))
491
- edges = nx.draw_networkx_edges(G, pos, alpha=0.5, width=[G[u][v]['weight'] for u,v in G.edges()])
492
- nodes = nx.draw_networkx_nodes(G, pos, node_size=700, node_color='orange')
493
- labels = nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')
 
 
 
494
  plt.title("Top 20 Artist Collaborations")
495
  st.pyplot(plt)
496
  else:
497
- st.error("Cannot plot: 'Artist Name(s)' column missing. Available columns: " + ", ".join(df.columns))
498
-
 
499
  with tab2:
500
- st.markdown("**Genre Crossover:** This chart shows how different music genres are connected and often blend together.")
 
501
  if 'Genres' in df.columns:
502
  df['Genres'] = df['Genres'].astype(str).str.split(', ')
503
  genre_pairs = []
504
  for genres in df['Genres']:
505
  genre_pairs.extend(combinations(sorted(set(genres)), 2))
506
-
507
  genre_counts = Counter(genre_pairs)
508
- top_genre_pairs = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:20]
509
-
510
- labels = list(set(chain.from_iterable([pair[0] for pair in top_genre_pairs])))
 
 
511
  matrix = [[0] * len(labels) for _ in range(len(labels))]
512
-
513
  label_index = {label: i for i, label in enumerate(labels)}
514
  for (genre1, genre2), count in top_genre_pairs:
515
  i, j = label_index[genre1], label_index[genre2]
516
  matrix[i][j] = count
517
  matrix[j][i] = count
518
-
519
- fig = go.Figure(data=[go.Heatmap(z=matrix, x=labels, y=labels, colorscale='OrRd', text=matrix, hoverinfo='text')])
520
- fig.update_layout(title="Genre Crossover Chord Diagram", xaxis_title="Genres", yaxis_title="Genres")
 
 
521
  st.plotly_chart(fig)
522
  else:
523
- st.error("Cannot plot: 'Genres' column missing. Available columns: " + ", ".join(df.columns))
 
 
5
  import matplotlib.pyplot as plt
6
  import networkx as nx
7
  import plotly.graph_objects as go
8
+ from itertools import chain, combinations
9
  import numpy as np
10
  from collections import Counter
11
 
12
 
13
  def generate_popularity_trends(df):
14
  st.header("Popularity Trends Over Time")
15
+ tab1, tab2, tab3 = st.tabs(
16
+ ["Average Popularity", "Individual Songs", "Top 10 Songs"])
17
+
18
  with tab1:
19
+ st.markdown(
20
+ "**Average Popularity by Decade:** This chart shows how the average popularity of songs has changed over different decades.")
21
  if 'Decade' in df.columns:
22
+ top_decades = df.groupby('Decade')['Popularity'].mean(
23
+ ).reset_index().nlargest(10, 'Popularity')
24
+
25
  fig1 = go.Figure()
26
  fig1.add_trace(go.Scatter(
27
  x=top_decades['Decade'],
 
29
  mode='lines+markers',
30
  fill='tonexty',
31
  line=dict(color='royalblue', width=3),
32
+ marker=dict(size=8, color='darkblue',
33
+ line=dict(width=2, color='white')),
34
  name='Popularity',
35
  hovertext=top_decades['Decade']
36
  ))
 
45
  st.plotly_chart(fig1)
46
  else:
47
  st.error("Cannot plot: 'Decade' column missing.")
48
+
49
  with tab2:
50
+ st.markdown(
51
+ "**Top 10 Individual Songs:** This scatter plot highlights the popularity of the top 10 most popular songs over time.")
52
  if 'Year' in df.columns:
53
  top_songs = df.nlargest(10, 'Popularity')
54
  fig2 = px.scatter(
 
69
  st.plotly_chart(fig2)
70
  else:
71
  st.error("Cannot plot: 'Year' column missing.")
72
+
73
  with tab3:
74
+ st.markdown(
75
+ "**Top 10 Most Popular Songs:** This bar chart displays the top 10 songs based on their popularity scores.")
76
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
77
+ top_songs = df.nlargest(10, 'Popularity')[
78
+ ['Track Name', 'Artist Name(s)', 'Popularity']]
79
  fig3 = px.bar(
80
  top_songs, y='Track Name', x='Popularity',
81
  orientation='h', color='Popularity',
82
  color_continuous_scale='deep',
83
  title='Top 10 Most Popular Songs',
84
+ labels={'Track Name': 'Song Title',
85
+ 'Popularity': 'Popularity Score'},
86
  hover_data=['Track Name', 'Artist Name(s)']
87
  )
88
  fig3.update_layout(
 
97
  st.error("Cannot plot: 'Track Name' or 'Popularity' column missing.")
98
 
99
 
100
+ def generate_audio_features(df):
101
  st.header("Audio Features Analysis")
102
 
103
  feature = st.selectbox(
104
  "Select Feature", ['Danceability', 'Energy', 'Tempo', 'Loudness']
105
  )
106
+
107
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
108
+
109
  with tab1:
110
+ st.markdown(
111
+ f"**Top 20 {feature} Values:** This bar chart displays the distribution of the top 20 songs based on {feature}.")
112
  top_features = df.nlargest(20, feature)
113
 
114
  fig = px.bar(
115
+ top_features, x='Track Name', y=feature,
116
  color='Decade' if 'Decade' in df.columns else None,
117
  title=f'Top 20 Songs by {feature}',
118
  color_discrete_sequence=px.colors.qualitative.Set2,
 
120
  )
121
  fig.update_layout(xaxis_tickangle=-45, template='plotly_white')
122
  st.plotly_chart(fig)
123
+
124
  with tab2:
125
+ st.markdown(
126
+ f"**{feature} by Decade:** This line chart compares the top {feature} trends over different decades.")
127
 
128
  if 'Decade' in df.columns:
129
+ avg_feature_by_decade = df.groupby(
130
+ 'Decade')[feature].mean().reset_index()
131
 
132
  fig2 = px.line(
133
  avg_feature_by_decade, x='Decade', y=feature,
 
141
  else:
142
  st.error("Cannot plot: 'Decade' column missing.")
143
 
144
+
145
  def generate_genre_analysis(df):
146
  st.header("Genre & Artist Analysis")
147
+ tab1, tab2, tab3 = st.tabs(
148
+ ["Top Genres", "Genre Distribution", "Artist Popularity"])
149
+
150
  with tab1:
151
+ st.markdown(
152
+ "**Top Genres in Top 10 Songs:** Displays the most common genres among the top 10 most popular songs.")
153
  top_songs = df.nlargest(10, 'Popularity')
154
+ top_genres = top_songs.explode(
155
+ 'Genres')['Genres'].value_counts().reset_index()
156
  fig1 = px.bar(
157
  top_genres, x='count', y='Genres',
158
  orientation='h', color='count',
 
163
  )
164
  fig1.update_layout(template='plotly_white', width=900, height=500)
165
  st.plotly_chart(fig1)
166
+
167
  with tab2:
168
+ st.markdown(
169
+ "**Genre Distribution in Top 10 Songs:** Shows how different genres contribute to the top 10 songs.")
170
  genre_song_data = top_songs.explode('Genres')
171
  fig2 = px.bar(
172
  genre_song_data, x='Track Name', y='Popularity', color='Genres',
173
  title='Genre Distribution in Top 10 Songs',
174
+ labels={'Track Name': 'Song Title',
175
+ 'Popularity': 'Popularity Score', 'Genres': 'Genre'},
176
  barmode='stack',
177
  hover_data=['Track Name', 'Genres']
178
  )
179
  fig2.update_layout(template='plotly_white', width=900, height=500)
180
  st.plotly_chart(fig2)
181
+
182
  with tab3:
183
+ st.markdown(
184
+ "**Artist Popularity in Top 10 Songs:** Visualizes the most popular artists in the top 10 songs with their song count and names.")
185
+ artist_popularity = top_songs.groupby('Artist Name(s)').agg(
186
+ {'Popularity': 'sum', 'Track Name': lambda x: list(x)}).reset_index().sort_values(by='Popularity', ascending=False)
187
+ artist_popularity['Song Count'] = artist_popularity['Track Name'].apply(
188
+ len)
189
  fig3 = px.bar(
190
  artist_popularity, x='Popularity', y='Artist Name(s)',
191
  orientation='h', color='Popularity',
192
  color_continuous_scale='blues',
193
  title='Artist Popularity in Top 10 Songs',
194
+ labels={'Artist Name(s)': 'Artist Name',
195
+ 'Popularity': 'Total Popularity Score', 'Song Count': 'Number of Songs'},
196
+ hover_data={'Artist Name(s)': True, 'Popularity': True,
197
+ 'Song Count': True, 'Track Name': True}
198
  )
199
  fig3.update_layout(template='plotly_white', width=900, height=500)
200
  st.plotly_chart(fig3)
201
 
202
+
203
  def generate_explicit_trends(df):
204
  st.header("Explicit Content Trends")
205
  st.markdown("**Explicit vs Non-Explicit Songs Over Time:** This line chart shows how the number of explicit and non-explicit songs has changed over different decades.")
206
  if 'Decade' in df.columns and 'Explicit' in df.columns:
207
+ explicit_trends = df.groupby(
208
+ ['Decade', 'Explicit']).size().reset_index(name='Count')
209
  fig = px.line(
210
  explicit_trends, x='Decade', y='Count', color='Explicit',
211
  markers=True, line_shape='linear',
212
  title='Explicit vs Non-Explicit Songs Over Time',
213
+ labels={'Decade': 'Decade', 'Count': 'Number of Songs',
214
+ 'Explicit': 'Song Type'},
215
  color_discrete_map={True: 'purple', False: 'green'}
216
  )
217
  fig.update_layout(template='plotly_white', width=900, height=500)
 
219
  else:
220
  st.error("Cannot plot: 'Decade' or 'Explicit' column missing.")
221
 
222
+
223
  def generate_album_insights(df):
224
  st.header("Album & Label Insights")
225
  tab1, tab2 = st.tabs(["Top Labels", "Album Popularity"])
226
+
227
  with tab1:
228
+ st.markdown(
229
+ "**Top Record Labels:** Displays the most dominant record labels based on the number of songs they have released.")
230
  if 'Label' in df.columns:
231
  top_labels = df['Label'].value_counts().nlargest(10).reset_index()
232
  fig9 = px.sunburst(
 
239
  st.plotly_chart(fig9)
240
  else:
241
  st.error("Cannot plot: 'Label' column missing.")
242
+
243
  with tab2:
244
+ st.markdown(
245
+ "**Album Popularity:** Compares the popularity of albums based on the number of songs and their average popularity score.")
246
  if 'Album Name' in df.columns and 'Popularity' in df.columns:
247
+ album_pop = df.groupby('Album Name')['Popularity'].agg(
248
+ ['mean', 'count']).reset_index()
249
+ album_pop = album_pop.sort_values(by=['mean', 'count'], ascending=[
250
+ False, False]).nlargest(10, 'mean')
251
  fig10 = px.strip(
252
  album_pop, x='mean', y='Album Name',
253
  color='count',
254
  title='Top 10 Albums by Popularity',
255
+ labels={'Album Name': 'Album',
256
+ 'mean': 'Average Popularity Score', 'count': 'Number of Songs'},
257
  hover_data={'Album Name': True, 'count': True, 'mean': True},
258
  color_discrete_sequence=px.colors.qualitative.Pastel
259
  )
 
270
  st.markdown("**Tempo Trends:** Tracks tempo changes.")
271
  if 'Year' in df.columns and 'Tempo' in df.columns:
272
  tempo_by_year = df.groupby('Year')['Tempo'].mean().reset_index()
273
+ fig11 = px.line(tempo_by_year, x='Year', y='Tempo',
274
+ title='Average Tempo Over Time', color_discrete_sequence=['orange'])
275
  fig11.update_layout(template='plotly_white', width=800, height=400)
276
  st.plotly_chart(fig11)
277
  else:
278
  st.error("Cannot plot: 'Year' or 'Tempo' column missing.")
279
  with tab2:
280
+ st.markdown(
281
+ "**Mood Analysis (Valence & Energy):** Categorizes songs based on mood and energy.")
282
  if 'Valence' in df.columns and 'Energy' in df.columns:
283
  top_songs = df.nlargest(10, 'Popularity')
284
+ mood_by_valence = top_songs.groupby(
285
+ 'Valence')['Energy'].mean().reset_index()
286
  fig12 = px.bar(
287
  mood_by_valence, x='Valence', y='Energy',
288
  title='Average Energy Levels by Valence (Mood Analysis)',
 
292
  st.plotly_chart(fig12)
293
  else:
294
  st.error("Cannot plot: 'Valence' or 'Energy' column missing.")
295
+
296
+
297
  def generate_top_artists_songs(df):
298
  st.header("Top Artists and Songs")
299
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
300
+
301
  with tab1:
302
  st.markdown("**Most Featured Artists:** Shows top artists.")
303
  if 'Artist Name(s)' in df.columns:
304
+ top_artists = df['Artist Name(s)'].value_counts().nlargest(
305
+ 10).reset_index()
306
  fig13 = px.bar(
307
  top_artists, x='count', y='Artist Name(s)',
308
  orientation='h',
 
313
  st.plotly_chart(fig13)
314
  else:
315
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
316
+
317
  with tab2:
318
  st.markdown("**Top 10 Songs:** Lists top songs.")
319
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
320
+ top_songs = df.nlargest(10, 'Popularity')[
321
+ ['Track Name', 'Popularity']]
322
  fig14 = px.pie(
323
  top_songs, values='Popularity', names='Track Name',
324
  title='Top 10 Songs by Popularity', color_discrete_sequence=px.colors.qualitative.Set3
 
335
  with tab1:
336
  st.markdown("**Albums per Year:** Tracks release patterns.")
337
  if 'Year' in df.columns:
338
+ albums_per_year = df['Year'].value_counts(
339
+ ).sort_index().reset_index()
340
+ fig15 = px.line(albums_per_year, x='Year', y='count',
341
+ title='Number of Albums Released per Year', color_discrete_sequence=['purple'])
342
  fig15.update_layout(template='plotly_white', width=800, height=400)
343
  st.plotly_chart(fig15)
344
  else:
 
347
  st.markdown("**Songs by Artists and Years:** Visualizes trends.")
348
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
349
  # Filter to only show the top 10 most featured artists
350
+ top_artists = df['Artist Name(s)'].value_counts().nlargest(
351
+ 10).index
352
  filtered_df = df[df['Artist Name(s)'].isin(top_artists)]
353
+
354
  # Grouping data
355
+ artist_year = filtered_df.groupby(
356
+ ['Year', 'Artist Name(s)']).size().reset_index(name='Count')
357
 
358
  # Create a grouped bar chart
359
  fig16 = px.bar(
 
367
  st.plotly_chart(fig16)
368
  else:
369
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
370
+
371
+
372
  def generate_duration_analysis(df):
373
  st.header("Track Duration Analysis")
374
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
 
377
  df = df[df['Track Duration (ms)'] <= 900000]
378
 
379
  with tab1:
380
+ st.markdown(
381
+ "**Track Duration Distribution:** Illustrates how track durations vary, helping identify common song lengths.")
382
  if 'Track Duration (ms)' in df.columns:
383
  fig17 = px.histogram(
384
  df, x='Track Duration (ms)',
 
390
  st.plotly_chart(fig17)
391
  else:
392
  st.error("Cannot plot: 'Track Duration (ms)' column missing.")
393
+
394
  with tab2:
395
+ st.markdown(
396
+ "**Duration by Decade:** Compares the evolution of average track durations across decades, showing historical trends.")
397
  if 'Decade' in df.columns and 'Track Duration (ms)' in df.columns:
398
  fig18 = px.pie(
399
+ df.groupby('Decade')[
400
+ 'Track Duration (ms)'].mean().reset_index(),
401
  names='Decade', values='Track Duration (ms)',
402
  title='Average Track Duration by Decade',
403
  color_discrete_sequence=px.colors.qualitative.Set2
 
405
  fig18.update_layout(template='plotly_white', width=800, height=400)
406
  st.plotly_chart(fig18)
407
  else:
408
+ st.error(
409
+ "Cannot plot: 'Decade' or 'Track Duration (ms)' column missing.")
410
 
411
 
412
  def generate_streaming_insights(df):
413
  st.header("Streaming and Engagement Insights")
414
  tab1, tab2 = st.tabs(["Popularity vs Duration", "Time Signature"])
415
+
416
  with tab1:
417
+ st.markdown(
418
+ "**Popularity vs Track Duration:** This line chart shows the trend of song popularity based on their duration.")
419
+
420
  if 'Track Duration (ms)' in df.columns and 'Popularity' in df.columns:
421
  df['Duration (minutes)'] = df['Track Duration (ms)'] / 60000
422
+ duration_bins = pd.cut(df['Duration (minutes)'], bins=[
423
+ 0, 2, 4, 6, 8, 10, 15], labels=['0-2', '2-4', '4-6', '6-8', '8-10', '10+'])
424
+ avg_popularity = df.groupby(duration_bins)[
425
+ 'Popularity'].mean().reset_index()
426
 
427
  fig1 = px.line(
428
+ avg_popularity,
429
+ x='Duration (minutes)',
430
  y='Popularity',
431
  title='Popularity vs. Track Duration',
432
  markers=True, # Adds points to the line
433
  line_shape='spline', # Smoothens the line
434
  color_discrete_sequence=['blue']
435
  )
436
+ fig1.update_layout(
437
+ template='plotly_white', xaxis_title='Track Duration (Minutes)', yaxis_title='Average Popularity')
438
  st.plotly_chart(fig1)
439
  else:
440
+ st.error(
441
+ "Cannot plot: 'Track Duration (ms)' or 'Popularity' column missing.")
442
 
443
  with tab2:
444
+ st.markdown(
445
+ "**Popularity by Time Signature:** This bar chart compares the average popularity of songs based on their time signatures.")
446
 
447
  if 'Time Signature' in df.columns and 'Popularity' in df.columns:
448
+ pop_by_time = df.groupby('Time Signature')[
449
+ 'Popularity'].mean().reset_index()
450
  fig2 = px.bar(
451
+ pop_by_time,
452
+ x='Time Signature',
453
  y='Popularity',
454
  title='Average Popularity by Time Signature',
455
  color='Popularity',
456
  color_continuous_scale='purples'
457
  )
458
+ fig2.update_layout(
459
+ template='plotly_white', xaxis_title='Time Signature', yaxis_title='Average Popularity')
460
  st.plotly_chart(fig2)
461
  else:
462
+ st.error(
463
+ "Cannot plot: 'Time Signature' or 'Popularity' column missing.")
464
+
465
 
466
  def generate_feature_comparisons(df):
467
  st.header("Feature Comparisons Across Decades")
 
469
  with tab1:
470
  st.markdown("**Feature Comparison:** Compares features across decades.")
471
  if 'Decade' in df.columns:
472
+ features_by_decade = df.groupby(
473
+ 'Decade')[['Danceability', 'Energy', 'Valence']].mean().reset_index()
474
  fig21 = px.bar(features_by_decade.melt(id_vars='Decade'), x='Decade', y='value', color='variable',
475
  barmode='group', title='Feature Comparison by Decade', color_discrete_sequence=px.colors.qualitative.Pastel)
476
  fig21.update_layout(template='plotly_white', width=800, height=400)
 
480
  with tab2:
481
  st.markdown("**Loudness Over Time:** Tracks loudness trends.")
482
  if 'Year' in df.columns and 'Loudness' in df.columns:
483
+ loudness_by_year = df.groupby(
484
+ 'Year')['Loudness'].mean().reset_index()
485
+ fig22 = px.line(loudness_by_year, x='Year', y='Loudness',
486
+ title='Average Loudness Over Time', color_discrete_sequence=['green'])
487
  fig22.update_layout(template='plotly_white', width=800, height=400)
488
  st.plotly_chart(fig22)
489
  else:
490
  st.error("Cannot plot: 'Year' or 'Loudness' column missing.")
491
 
492
+
493
  def generate_top_artists_songs(df):
494
  st.header("Top Artists and Songs")
495
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
496
+
497
  with tab1:
498
+ st.markdown(
499
+ "**Most Featured Artists:** Displays the top 10 artists with the highest song counts, highlighting their dominance in the dataset.")
500
  if 'Artist Name(s)' in df.columns:
501
+ top_artists = df['Artist Name(s)'].value_counts().nlargest(
502
+ 10).reset_index()
503
  top_artists.columns = ['Artist Name(s)', 'Count']
504
  fig13 = px.sunburst(
505
  top_artists, path=['Artist Name(s)'], values='Count',
 
511
  st.plotly_chart(fig13)
512
  else:
513
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
514
+
515
  with tab2:
516
+ st.markdown(
517
+ "**Songs by Artists and Years:** Analyzes song release trends across different years, focusing on the top artists.")
518
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
519
+ artist_year = df.groupby(
520
+ ['Artist Name(s)', 'Year']).size().reset_index(name='Count')
521
  fig16 = px.sunburst(
522
  artist_year, path=['Year', 'Artist Name(s)'], values='Count',
523
  title='Songs Released by Artists Over the Years',
 
529
  else:
530
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
531
 
532
+
533
  def generate_network_analysis(df):
534
  st.header("Network Analysis")
535
  tab1, tab2 = st.tabs(["Artist Collaborations", "Genre Crossover"])
536
+
537
  # Ensure column names are stripped of spaces
538
  df.columns = df.columns.str.strip()
539
+
540
  with tab1:
541
+ st.markdown(
542
+ "**Top Collaborating Artists:** This chart highlights artists who frequently collaborate with each other.")
543
  if 'Artist Name(s)' in df.columns:
544
+ df['Artist Name(s)'] = df['Artist Name(s)'].astype(
545
+ str).str.split(', ')
546
  collaborations = []
547
  for artists in df['Artist Name(s)']:
548
  collaborations.extend(combinations(sorted(artists), 2))
549
+
550
  collab_counts = Counter(collaborations)
551
+ top_collabs = sorted(collab_counts.items(),
552
+ key=lambda x: x[1], reverse=True)[:20]
553
+
554
  G = nx.Graph()
555
  for (artist1, artist2), weight in top_collabs:
556
  G.add_edge(artist1, artist2, weight=weight)
557
+
558
  pos = nx.spring_layout(G, seed=42)
559
  plt.figure(figsize=(12, 8))
560
+ edges = nx.draw_networkx_edges(G, pos, alpha=0.5, width=[
561
+ G[u][v]['weight'] for u, v in G.edges()])
562
+ nodes = nx.draw_networkx_nodes(
563
+ G, pos, node_size=700, node_color='orange')
564
+ labels = nx.draw_networkx_labels(
565
+ G, pos, font_size=10, font_weight='bold')
566
  plt.title("Top 20 Artist Collaborations")
567
  st.pyplot(plt)
568
  else:
569
+ st.error(
570
+ "Cannot plot: 'Artist Name(s)' column missing. Available columns: " + ", ".join(df.columns))
571
+
572
  with tab2:
573
+ st.markdown(
574
+ "**Genre Crossover:** This chart shows how different music genres are connected and often blend together.")
575
  if 'Genres' in df.columns:
576
  df['Genres'] = df['Genres'].astype(str).str.split(', ')
577
  genre_pairs = []
578
  for genres in df['Genres']:
579
  genre_pairs.extend(combinations(sorted(set(genres)), 2))
580
+
581
  genre_counts = Counter(genre_pairs)
582
+ top_genre_pairs = sorted(
583
+ genre_counts.items(), key=lambda x: x[1], reverse=True)[:20]
584
+
585
+ labels = list(set(chain.from_iterable(
586
+ [pair[0] for pair in top_genre_pairs])))
587
  matrix = [[0] * len(labels) for _ in range(len(labels))]
588
+
589
  label_index = {label: i for i, label in enumerate(labels)}
590
  for (genre1, genre2), count in top_genre_pairs:
591
  i, j = label_index[genre1], label_index[genre2]
592
  matrix[i][j] = count
593
  matrix[j][i] = count
594
+
595
+ fig = go.Figure(data=[go.Heatmap(
596
+ z=matrix, x=labels, y=labels, colorscale='OrRd', text=matrix, hoverinfo='text')])
597
+ fig.update_layout(title="Genre Crossover Chord Diagram",
598
+ xaxis_title="Genres", yaxis_title="Genres")
599
  st.plotly_chart(fig)
600
  else:
601
+ st.error(
602
+ "Cannot plot: 'Genres' column missing. Available columns: " + ", ".join(df.columns))