Ezhil committed on
Commit
be1014f
·
1 Parent(s): f814502

Changes DV - network analysis

Browse files
functions/__pycache__/visualizations.cpython-310.pyc CHANGED
Binary files a/functions/__pycache__/visualizations.cpython-310.pyc and b/functions/__pycache__/visualizations.cpython-310.pyc differ
 
functions/visualizations.py CHANGED
@@ -5,6 +5,7 @@ import seaborn as sns
5
  import matplotlib.pyplot as plt
6
  import networkx as nx
7
  import plotly.graph_objects as go
 
8
 
9
  def generate_popularity_trends(df):
10
  st.header("Popularity Trends Over Time")
@@ -250,25 +251,36 @@ def generate_tempo_mood(df):
250
  def generate_top_artists_songs(df):
251
  st.header("Top Artists and Songs")
252
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
 
253
  with tab1:
254
  st.markdown("**Most Featured Artists:** Shows top artists.")
255
  if 'Artist Name(s)' in df.columns:
256
  top_artists = df['Artist Name(s)'].value_counts().nlargest(10).reset_index()
257
- fig13 = px.bar(top_artists, x='Artist Name(s)', y='count', title='Most Featured Artists', color_discrete_sequence=['green'])
258
- fig13.update_layout(template='plotly_white', width=800, height=400)
 
 
 
 
 
259
  st.plotly_chart(fig13)
260
  else:
261
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
 
262
  with tab2:
263
  st.markdown("**Top 10 Songs:** Lists top songs.")
264
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
265
  top_songs = df.nlargest(10, 'Popularity')[['Track Name', 'Popularity']]
266
- fig14 = px.bar(top_songs, y='Track Name', x='Popularity', orientation='h', title='Top 10 Songs by Popularity', color_discrete_sequence=['blue'])
267
- fig14.update_layout(template='plotly_white', width=800, height=400)
 
 
 
268
  st.plotly_chart(fig14)
269
  else:
270
  st.error("Cannot plot: 'Track Name' or 'Popularity' column missing.")
271
 
 
272
  def generate_album_release_trends(df):
273
  st.header("Album Release Trends")
274
  tab1, tab2 = st.tabs(["Albums per Year", "Artist-Year Heatmap"])
@@ -284,54 +296,96 @@ def generate_album_release_trends(df):
284
  with tab2:
285
  st.markdown("**Songs by Artists and Years:** Visualizes trends.")
286
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
287
- artist_year = df.groupby(['Artist Name(s)', 'Year']).size().unstack().fillna(0)
288
- fig16 = px.imshow(artist_year, title='Songs Released by Artists Across Years', color_continuous_scale='Viridis')
289
- fig16.update_layout(width=800, height=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  st.plotly_chart(fig16)
291
  else:
292
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
293
-
294
  def generate_duration_analysis(df):
295
  st.header("Track Duration Analysis")
296
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
 
 
 
 
297
  with tab1:
298
- st.markdown("**Track Duration Distribution:** Shows duration lengths.")
299
  if 'Track Duration (ms)' in df.columns:
300
- fig17 = px.histogram(df, x='Track Duration (ms)', title='Distribution of Track Durations', color_discrete_sequence=['orange'])
 
 
 
 
 
301
  fig17.update_layout(template='plotly_white', width=800, height=400)
302
  st.plotly_chart(fig17)
303
  else:
304
  st.error("Cannot plot: 'Track Duration (ms)' column missing.")
 
305
  with tab2:
306
- st.markdown("**Duration by Decade:** Compares durations.")
307
  if 'Decade' in df.columns and 'Track Duration (ms)' in df.columns:
308
- fig18 = px.box(df, x='Decade', y='Track Duration (ms)', title='Track Duration by Decade', color_discrete_sequence=['green'])
 
 
 
 
 
309
  fig18.update_layout(template='plotly_white', width=800, height=400)
310
  st.plotly_chart(fig18)
311
  else:
312
  st.error("Cannot plot: 'Decade' or 'Track Duration (ms)' column missing.")
313
 
 
314
  def generate_streaming_insights(df):
315
  st.header("Streaming and Engagement Insights")
316
  tab1, tab2 = st.tabs(["Popularity vs Duration", "Time Signature"])
 
317
  with tab1:
318
- st.markdown("**Popularity vs Duration:** Explores engagement trends.")
319
  if 'Track Duration (ms)' in df.columns and 'Popularity' in df.columns:
320
- fig19 = px.scatter(df, x='Track Duration (ms)', y='Popularity', title='Popularity vs Track Duration', color_discrete_sequence=['blue'])
321
- fig19.update_layout(template='plotly_white', width=800, height=400)
 
 
 
 
 
 
322
  st.plotly_chart(fig19)
323
  else:
324
  st.error("Cannot plot: 'Track Duration (ms)' or 'Popularity' column missing.")
 
325
  with tab2:
326
- st.markdown("**Popularity by Time Signature:** Compares popularity.")
327
  if 'Time Signature' in df.columns and 'Popularity' in df.columns:
328
  pop_by_time = df.groupby('Time Signature')['Popularity'].mean().reset_index()
329
- fig20 = px.bar(pop_by_time, x='Time Signature', y='Popularity', title='Average Popularity by Time Signature', color_discrete_sequence=['purple'])
 
 
 
 
 
330
  fig20.update_layout(template='plotly_white', width=800, height=400)
331
  st.plotly_chart(fig20)
332
  else:
333
  st.error("Cannot plot: 'Time Signature' or 'Popularity' column missing.")
334
 
 
335
  def generate_feature_comparisons(df):
336
  st.header("Feature Comparisons Across Decades")
337
  tab1, tab2 = st.tabs(["Feature Comparison", "Loudness Trends"])
@@ -355,65 +409,74 @@ def generate_feature_comparisons(df):
355
  else:
356
  st.error("Cannot plot: 'Year' or 'Loudness' column missing.")
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  def generate_network_analysis(df):
359
  st.header("Network Analysis")
360
  tab1, tab2 = st.tabs(["Artist Collaborations", "Genre Crossover"])
 
361
  with tab1:
362
- st.markdown("**Artist Collaborations:** Visualizes artist connections.")
363
- if 'Artist Name(s)' in df.columns:
364
- valid_artists = df['Artist Name(s)'].dropna().astype(str)
365
- G = nx.Graph()
366
- for artists in valid_artists:
367
- artists_list = [a.strip() for a in artists.split(',') if a.strip()]
368
- if len(artists_list) > 1:
369
- for a1, a2 in combinations(artists_list, 2):
370
- G.add_edge(a1, a2)
371
- if G.number_of_nodes() > 0:
372
- pos = nx.spring_layout(G)
373
- edge_x = []
374
- edge_y = []
375
- for edge in G.edges():
376
- x0, y0 = pos[edge[0]]
377
- x1, y1 = pos[edge[1]]
378
- edge_x.extend([x0, x1, None])
379
- edge_y.extend([y0, y1, None])
380
-
381
- edge_trace = go.Scatter(
382
- x=edge_x, y=edge_y,
383
- line=dict(width=0.5, color='#888'),
384
- hoverinfo='none',
385
- mode='lines')
386
-
387
- node_x = [pos[node][0] for node in G.nodes()]
388
- node_y = [pos[node][1] for node in G.nodes()]
389
- node_trace = go.Scatter(
390
- x=node_x, y=node_y,
391
- mode='markers+text',
392
- hoverinfo='text',
393
- marker=dict(size=10, color='red'),
394
- text=list(G.nodes()),
395
- textposition="top center")
396
-
397
- fig = go.Figure(data=[edge_trace, node_trace],
398
- layout=go.Layout(
399
- title='Artist Collaborations',
400
- showlegend=False,
401
- hovermode='closest',
402
- margin=dict(b=0, l=0, r=0, t=40),
403
- width=800, height=600))
404
- st.plotly_chart(fig)
405
- else:
406
- st.warning("No artist collaborations to display.")
407
  else:
408
- st.error("Cannot plot: 'Artist Name(s)' column missing.")
 
409
  with tab2:
410
- st.markdown("**Genre Crossover:** Placeholder for future visualization.")
411
- st.write("To implement, install `holoviews` and use the following code:")
412
- st.code("""
413
- import holoviews as hv
414
- hv.extension('bokeh')
415
- genre_pairs = df.explode('Genres')[['Genres']].merge(df.explode('Genres')[['Genres']], how='cross')
416
- chord_data = genre_pairs.groupby(['Genres_x', 'Genres_y']).size().reset_index(name='value')
417
- chord = hv.Chord(chord_data).opts(title="Genre Crossover")
418
- st.write(hv.render(chord, backend='bokeh'))
419
- """)
 
 
 
 
5
  import matplotlib.pyplot as plt
6
  import networkx as nx
7
  import plotly.graph_objects as go
8
+ from itertools import combinations
9
 
10
  def generate_popularity_trends(df):
11
  st.header("Popularity Trends Over Time")
 
251
  def generate_top_artists_songs(df):
252
  st.header("Top Artists and Songs")
253
  tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])
254
+
255
  with tab1:
256
  st.markdown("**Most Featured Artists:** Shows top artists.")
257
  if 'Artist Name(s)' in df.columns:
258
  top_artists = df['Artist Name(s)'].value_counts().nlargest(10).reset_index()
259
+ fig13 = px.bar(
260
+ top_artists, x='count', y='Artist Name(s)',
261
+ orientation='h',
262
+ title='Most Featured Artists',
263
+ color='count', color_continuous_scale='greens'
264
+ )
265
+ fig13.update_layout(template='plotly_white', width=900, height=500)
266
  st.plotly_chart(fig13)
267
  else:
268
  st.error("Cannot plot: 'Artist Name(s)' column missing.")
269
+
270
  with tab2:
271
  st.markdown("**Top 10 Songs:** Lists top songs.")
272
  if 'Track Name' in df.columns and 'Popularity' in df.columns:
273
  top_songs = df.nlargest(10, 'Popularity')[['Track Name', 'Popularity']]
274
+ fig14 = px.pie(
275
+ top_songs, values='Popularity', names='Track Name',
276
+ title='Top 10 Songs by Popularity', color_discrete_sequence=px.colors.qualitative.Set3
277
+ )
278
+ fig14.update_layout(template='plotly_white', width=900, height=500)
279
  st.plotly_chart(fig14)
280
  else:
281
  st.error("Cannot plot: 'Track Name' or 'Popularity' column missing.")
282
 
283
+
284
  def generate_album_release_trends(df):
285
  st.header("Album Release Trends")
286
  tab1, tab2 = st.tabs(["Albums per Year", "Artist-Year Heatmap"])
 
296
  with tab2:
297
  st.markdown("**Songs by Artists and Years:** Visualizes trends.")
298
  if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
299
+ # Filter to only show the top 10 most featured artists
300
+ top_artists = df['Artist Name(s)'].value_counts().nlargest(10).index
301
+ filtered_df = df[df['Artist Name(s)'].isin(top_artists)]
302
+
303
+ # Grouping data
304
+ artist_year = filtered_df.groupby(['Year', 'Artist Name(s)']).size().reset_index(name='Count')
305
+
306
+ # Create a grouped bar chart
307
+ fig16 = px.bar(
308
+ artist_year, x='Year', y='Count', color='Artist Name(s)',
309
+ title='Songs Released by Top Artists Over the Years',
310
+ labels={'Count': 'Number of Songs', 'Year': 'Year'},
311
+ barmode='group', # Grouped bars for each artist per year
312
+ color_discrete_sequence=px.colors.qualitative.Set2
313
+ )
314
+ fig16.update_layout(width=900, height=500)
315
  st.plotly_chart(fig16)
316
  else:
317
  st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
 
318
  def generate_duration_analysis(df):
319
  st.header("Track Duration Analysis")
320
  tab1, tab2 = st.tabs(["Distribution", "By Decade"])
321
+
322
+ # Filter out tracks longer than 900,000ms (15 minutes)
323
+ df = df[df['Track Duration (ms)'] <= 900000]
324
+
325
  with tab1:
326
+ st.markdown("**Track Duration Distribution:** Illustrates how track durations vary, helping identify common song lengths.")
327
  if 'Track Duration (ms)' in df.columns:
328
+ fig17 = px.histogram(
329
+ df, x='Track Duration (ms)',
330
+ title='Track Duration Distribution (Filtered)',
331
+ nbins=50,
332
+ color_discrete_sequence=['orange']
333
+ )
334
  fig17.update_layout(template='plotly_white', width=800, height=400)
335
  st.plotly_chart(fig17)
336
  else:
337
  st.error("Cannot plot: 'Track Duration (ms)' column missing.")
338
+
339
  with tab2:
340
+ st.markdown("**Duration by Decade:** Compares the evolution of average track durations across decades, showing historical trends.")
341
  if 'Decade' in df.columns and 'Track Duration (ms)' in df.columns:
342
+ fig18 = px.pie(
343
+ df.groupby('Decade')['Track Duration (ms)'].mean().reset_index(),
344
+ names='Decade', values='Track Duration (ms)',
345
+ title='Average Track Duration by Decade',
346
+ color_discrete_sequence=px.colors.qualitative.Set2
347
+ )
348
  fig18.update_layout(template='plotly_white', width=800, height=400)
349
  st.plotly_chart(fig18)
350
  else:
351
  st.error("Cannot plot: 'Decade' or 'Track Duration (ms)' column missing.")
352
 
353
+
354
  def generate_streaming_insights(df):
355
  st.header("Streaming and Engagement Insights")
356
  tab1, tab2 = st.tabs(["Popularity vs Duration", "Time Signature"])
357
+
358
  with tab1:
359
+ st.markdown("**Popularity vs Duration:** Examines how track length influences popularity trends.")
360
  if 'Track Duration (ms)' in df.columns and 'Popularity' in df.columns:
361
+ df['Duration (minutes)'] = df['Track Duration (ms)'] / 60000
362
+ fig19 = px.box(
363
+ df, x=pd.cut(df['Duration (minutes)'], bins=[0, 2, 4, 6, 8, 10, 15], labels=['0-2', '2-4', '4-6', '6-8', '8-10', '10+']),
364
+ y='Popularity',
365
+ title='Popularity Distribution Across Track Durations',
366
+ color_discrete_sequence=['blue']
367
+ )
368
+ fig19.update_layout(template='plotly_white', width=800, height=400, xaxis_title='Track Duration (Minutes)')
369
  st.plotly_chart(fig19)
370
  else:
371
  st.error("Cannot plot: 'Track Duration (ms)' or 'Popularity' column missing.")
372
+
373
  with tab2:
374
+ st.markdown("**Popularity by Time Signature:** Analyzes the average popularity of songs across different time signatures.")
375
  if 'Time Signature' in df.columns and 'Popularity' in df.columns:
376
  pop_by_time = df.groupby('Time Signature')['Popularity'].mean().reset_index()
377
+ fig20 = px.bar(
378
+ pop_by_time, x='Time Signature', y='Popularity',
379
+ title='Average Popularity by Time Signature',
380
+ color='Popularity',
381
+ color_continuous_scale='purples'
382
+ )
383
  fig20.update_layout(template='plotly_white', width=800, height=400)
384
  st.plotly_chart(fig20)
385
  else:
386
  st.error("Cannot plot: 'Time Signature' or 'Popularity' column missing.")
387
 
388
+
389
  def generate_feature_comparisons(df):
390
  st.header("Feature Comparisons Across Decades")
391
  tab1, tab2 = st.tabs(["Feature Comparison", "Loudness Trends"])
 
409
  else:
410
  st.error("Cannot plot: 'Year' or 'Loudness' column missing.")
411
 
412
def generate_top_artists_songs(df):
    """Render the "Top Artists and Songs" section as two sunburst charts.

    Tab 1 shows the ten most frequent 'Artist Name(s)' values sized by song
    count. Tab 2 shows song counts grouped by 'Year' and then by
    'Artist Name(s)'.

    NOTE(review): this is the second definition of
    generate_top_artists_songs in the module, so at import time it replaces
    the earlier bar/pie version. The second tab is also labelled
    "Top Songs" although it plots songs per artist and year. Confirm which
    definition is intended and remove or rename the other.

    Args:
        df: DataFrame read for 'Artist Name(s)' and 'Year'. A tab whose
            required columns are absent shows an error message instead.

    Returns:
        None. Output is written to the Streamlit page.
    """
    st.header("Top Artists and Songs")
    tab1, tab2 = st.tabs(["Top Artists", "Top Songs"])

    with tab1:
        st.markdown("**Most Featured Artists:** Displays the top 10 artists with the highest song counts, highlighting their dominance in the dataset.")
        if 'Artist Name(s)' in df.columns:
            top_artists = df['Artist Name(s)'].value_counts().nlargest(10).reset_index()
            top_artists.columns = ['Artist Name(s)', 'Count']
            fig13 = px.sunburst(
                top_artists,
                path=['Artist Name(s)'],
                values='Count',
                title='Most Featured Artists',
                color='Count',
                color_continuous_scale='greens',
            )
            fig13.update_layout(template='plotly_white', width=900, height=500)
            st.plotly_chart(fig13)
        else:
            st.error("Cannot plot: 'Artist Name(s)' column missing.")

    with tab2:
        st.markdown("**Songs by Artists and Years:** Analyzes song release trends across different years, focusing on the top artists.")
        if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
            artist_year = df.groupby(['Artist Name(s)', 'Year']).size().reset_index(name='Count')
            fig16 = px.sunburst(
                artist_year,
                path=['Year', 'Artist Name(s)'],
                values='Count',
                title='Songs Released by Artists Over the Years',
                color='Count',
                # 'Count' is numeric, so use a sequential scale. A
                # qualitative palette such as Set2 has no ordering, which
                # makes the resulting gradient unreadable as a magnitude.
                color_continuous_scale='blues',
            )
            fig16.update_layout(width=900, height=500)
            st.plotly_chart(fig16)
        else:
            st.error("Cannot plot: 'Artist Name(s)' or 'Year' column missing.")
446
+
447
  def generate_network_analysis(df):
448
  st.header("Network Analysis")
449
  tab1, tab2 = st.tabs(["Artist Collaborations", "Genre Crossover"])
450
+
451
  with tab1:
452
+ st.markdown("**Artist Collaborations:** Visualizes artist connections over time.")
453
+ if 'Artist Name(s)' in df.columns and 'Year' in df.columns:
454
+ df['Num_Artists'] = df['Artist Name(s)'].apply(lambda x: len(str(x).split(',')))
455
+ df['Is_Collaboration'] = df['Num_Artists'] > 1
456
+ collab_trend = df.groupby('Year')['Is_Collaboration'].mean().reset_index()
457
+ collab_trend['Is_Collaboration'] *= 100 # Convert to percentage
458
+
459
+ fig = px.line(collab_trend, x='Year', y='Is_Collaboration', markers=True,
460
+ title="% of Songs in the Top 10 That Are Collaborative",
461
+ labels={'Year': 'Year', 'Is_Collaboration': 'Collaborative Songs (%)'})
462
+ fig.update_traces(marker=dict(size=8, color='blue'))
463
+ fig.update_layout(width=900, height=500, template='plotly_white')
464
+
465
+ st.plotly_chart(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
  else:
467
+ st.error("Cannot plot: 'Year' or 'Artist Name(s)' column missing.")
468
+
469
  with tab2:
470
+ st.markdown("**Genre Crossover:** Displays the statistical distribution of genres using a violin plot.")
471
+ if 'Genres' in df.columns:
472
+ df['Genres'] = df['Genres'].astype(str).str.split(', ')
473
+ genre_counts = df.explode('Genres')['Genres'].value_counts().reset_index()
474
+ genre_counts.columns = ['Genre', 'Count']
475
+
476
+ fig = px.violin(genre_counts, y='Genre', x='Count', box=True, points="all",
477
+ title='Genre Popularity Distribution',
478
+ color_discrete_sequence=['purple'])
479
+ fig.update_layout(width=900, height=600, template='plotly_white')
480
+ st.plotly_chart(fig)
481
+ else:
482
+ st.error("Cannot plot: 'Genres' column missing.")