3v324v23 committed on
Commit
1770461
·
1 Parent(s): 533b3b1
Files changed (3) hide show
  1. app.py +72 -0
  2. requirements.txt +6 -0
  3. top_10000_1950-now.csv +0 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: explore a song dataset and cluster its numeric features with DBSCAN.

The page has three tabs: a dataset overview, correlation/pairplot
visualizations, and a clustering tab where the user picks the features that
are standardized and clustered.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters; they are applied to the standardized features.
DBSCAN_EPS = 1.0
DBSCAN_MIN_SAMPLES = 10

# Columns pre-selected in the pairplot picker when the dataset provides them.
PAIRPLOT_DEFAULTS = ["Danceability", "Energy", "Tempo", "Loudness", "Valence"]


@st.cache_data
def load_data(path):
    """Read the CSV at *path* into a DataFrame.

    Cached so that the script reruns triggered by widget interaction do not
    parse the file again.
    """
    return pd.read_csv(path)


def show_figure(fig):
    """Render *fig* in the app, then close it so reruns do not pile up open figures."""
    st.pyplot(fig)
    plt.close(fig)


st.title("Music Genre Clustering with DBSCAN")

# Load dataset directly
file_path = "top_10000_1950-now.csv"
df = load_data(file_path)

# Remove non-numeric columns: the correlation matrix, the pairplot and the
# clustering all operate on numeric data only.
df_numeric = df.select_dtypes(include=[np.number])
numeric_columns = df_numeric.columns.tolist()

# Create tabs
tab1, tab2, tab3 = st.tabs(["Overview", "Visualization Matrix", "User Input"])

with tab1:
    st.write("### Dataset Overview")
    st.dataframe(df.head())
    st.write("### Dataset Information")
    st.write(df_numeric.describe())

with tab2:
    st.write("### Correlation Matrix")
    # Draw on an explicit Figure/Axes instead of the global pyplot state.
    corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=corr_ax)
    show_figure(corr_fig)

    st.write("### Pairplot Visualization")
    pairplot_features = st.multiselect(
        "Select Features for Pairplot",
        numeric_columns,
        # Only offer defaults that are actually among the options.
        default=[name for name in PAIRPLOT_DEFAULTS if name in numeric_columns],
    )
    if pairplot_features:
        # pairplot builds its own figure; render that one explicitly.
        pair_grid = sns.pairplot(df[pairplot_features])
        show_figure(pair_grid.figure)

with tab3:
    st.write("### Clustering Settings")
    num_features = st.slider(
        "Select Number of Features",
        2,
        len(numeric_columns),
        # Keep the initial value inside the slider range for narrow datasets.
        min(5, len(numeric_columns)),
    )
    features = st.multiselect(
        "Select Features for Clustering",
        numeric_columns,
        default=numeric_columns[:num_features],
    )

    if st.button("Run Clustering"):
        # Rows with a missing value in any selected feature are excluded.
        df_filtered = df_numeric[features].dropna()

        if len(features) < 2:
            st.warning("Please select at least two features for clustering.")
        elif df_filtered.empty:
            st.warning("No rows without missing values remain for the selected features.")
        else:
            X_scaled = StandardScaler().fit_transform(df_filtered)

            dbscan = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
            labels = dbscan.fit_predict(X_scaled)

            # Rows that were dropped above keep NaN and are hidden by the
            # dropna() in the table below.
            df["Cluster"] = np.nan
            df.loc[df_filtered.index, "Cluster"] = labels

            st.write("### Clustered Data:")
            st.dataframe(df[["Track Name", "Artist Name(s)", "Cluster"]].dropna().head(20))

            st.write("### Cluster Visualization:")
            fig, ax = plt.subplots()
            # Only the first two selected features are plotted.
            scatter = ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap="viridis", alpha=0.7)
            ax.set_xlabel(f"{features[0]} (standardized)")
            ax.set_ylabel(f"{features[1]} (standardized)")
            ax.legend(*scatter.legend_elements(), title="Clusters")
            show_figure(fig)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn
top_10000_1950-now.csv ADDED
The diff for this file is too large to render. See raw diff