aikanava committed on
Commit
1c6ddfa
·
verified ·
1 Parent(s): f53e67b

upload files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +102 -0
  3. requirements.txt +6 -0
  4. spotify_millsongdata.csv +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ spotify_millsongdata.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.neighbors import NearestNeighbors
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from collections import Counter
9
+
10
# Set the browser-tab title before any other UI element is rendered.
# The leading emoji was mis-encoded in the source; restored as a music note.
st.set_page_config(page_title="🎵 Lyrics-Based Song Recommendations")
11
+
12
# Load dataset
@st.cache_data
def load_data(path="spotify_millsongdata.csv"):
    """Read the song/lyrics CSV and drop rows that have no lyrics.

    Args:
        path: Location of the CSV file. Defaults to the dataset file name
            the app originally hard-coded, so existing ``load_data()``
            calls behave as before.

    Returns:
        A ``pandas.DataFrame`` holding every row of the file whose ``text``
        column is not missing. The original row labels are kept (the index
        is not reset).
    """
    songs = pd.read_csv(path)
    # Rows without lyrics cannot be vectorised, so remove them up front.
    return songs.dropna(subset=["text"])
18
+
19
df = load_data()


@st.cache_resource
def _fit_recommender(_lyrics):
    """Fit the TF-IDF vectorizer and the cosine nearest-neighbour index.

    Streamlit re-executes the script on every widget interaction; without
    caching, the vectorizer and the KNN index would be refit each time.
    ``st.cache_resource`` keeps the fitted objects alive across reruns.

    Args:
        _lyrics: Iterable of lyric strings, one per song. The leading
            underscore tells Streamlit not to hash this argument, so the
            cache is keyed on the function alone (the dataset is a static
            file that ``load_data`` already caches).

    Returns:
        Tuple ``(vectorizer, lyrics_matrix, knn)``: the fitted
        ``TfidfVectorizer``, the TF-IDF matrix of all lyrics, and the
        ``NearestNeighbors`` model fitted on that matrix.
    """
    # Convert lyrics into numerical features using TF-IDF
    tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
    matrix = tfidf.fit_transform(_lyrics)

    # Train KNN Model
    neighbours = NearestNeighbors(n_neighbors=5, metric="cosine")
    neighbours.fit(matrix)
    return tfidf, matrix, neighbours


# Row i of lyrics_matrix corresponds to df.iloc[i]; the recommendation tab
# relies on that positional alignment.
vectorizer, lyrics_matrix, knn = _fit_recommender(df["text"])
28
+
29
# Streamlit UI
# Emoji in the title and tab labels were mis-encoded in the source and are
# restored here.
st.title("🎶 Lyrics-Based Song Recommendation System")

st.markdown(
    "Discover songs that match your favorite lyrics! This app uses **TF-IDF** and **KNN** to find songs with similar lyrical content."
)

# Tabs for better UI: tab1 shows dataset statistics, tab2 the recommender.
tab1, tab2 = st.tabs(["📊 Dataset Overview", "🎤 Lyrics-Based Recommendation"])
38
+
39
with tab1:
    # Dataset Sample: up to 20 random rows. DataFrame.sample raises when
    # asked for more rows than exist, so cap the request at the dataset size.
    sample_df = df.sample(min(20, len(df)))
    st.dataframe(sample_df[["song", "artist", "text"]])

    # Word count per song, computed once and shared by the statistics and
    # the histogram below.
    lyrics_length = df["text"].apply(lambda text: len(text.split()))

    # Expander for Dataset Statistics
    with st.expander("📊 Dataset Statistics"):
        total_songs = df.shape[0]
        unique_artists = df["artist"].nunique()
        avg_lyrics_length = lyrics_length.mean()

        st.write(f"📊 **Total Songs**: {total_songs}")
        st.write(f"🎤 **Unique Artists**: {unique_artists}")
        st.write(f"📖 **Average Lyrics Length**: {avg_lyrics_length:.2f} words")

    # Expander for Lyrics Length Distribution
    with st.expander("📖 Lyrics Length Distribution (Word Count per Song)"):
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(lyrics_length, kde=True, ax=ax, color="skyblue")
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Number of Songs")
        st.pyplot(fig)
        # pyplot keeps every figure it creates until it is closed; close it
        # so figures do not pile up across Streamlit reruns.
        plt.close(fig)

    # Most Frequent Artists (top 10 by number of songs)
    with st.expander("🎤 Most Frequent Artists in the Dataset"):
        artist_counts = df["artist"].value_counts().head(10)
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(y=artist_counts.index, x=artist_counts.values, ax=ax, palette="mako")
        ax.set_xlabel("Number of Songs")
        ax.set_ylabel("Artist")
        st.pyplot(fig)
        plt.close(fig)
73
+
74
with tab2:
    st.subheader("Enter Lyrics Snippet")
    user_lyrics = st.text_area("Type lyrics snippet:", "")

    if st.button("🎶 Get Recommendations"):
        if not user_lyrics.strip():
            # Previously a click with an empty box did nothing at all.
            st.warning("Please enter a lyrics snippet first.")
        else:
            # Convert user input into the same TF-IDF space
            user_vector = vectorizer.transform([user_lyrics])

            if user_vector.nnz == 0:
                # None of the typed words are in the fitted vocabulary, so
                # every song would score a similarity of 0.
                st.warning(
                    "None of the words in your snippet were recognised; "
                    "the matches below are not meaningful."
                )

            # Find similar songs
            distances, indices = knn.kneighbors(user_vector)

            st.subheader("🎧 Recommended Songs:")
            recommendations = []
            for distance, row_pos in zip(distances[0], indices[0]):
                # Positional lookup: row_pos indexes the rows the model was
                # fitted on, which are the rows of df in order.
                match = df.iloc[row_pos]
                recommended_song = match["song"]
                recommended_artist = match["artist"]
                similarity_score = 1 - distance  # Convert cosine distance to similarity
                recommendations.append((recommended_song, recommended_artist, similarity_score))
                st.write(f"🎶 **{recommended_song}** - {recommended_artist} (Similarity: `{similarity_score:.2f}`)")

            # Plot similarity scores
            with st.expander("📊 Similarity Scores"):
                fig, ax = plt.subplots(figsize=(8, 4))
                song_names = [rec[0] for rec in recommendations]
                similarity_scores = [rec[2] for rec in recommendations]
                sns.barplot(x=similarity_scores, y=song_names, ax=ax, palette="coolwarm")
                ax.set_xlabel("Similarity Score")
                ax.set_ylabel("Recommended Songs")
                st.pyplot(fig)
                # Release the figure so it does not accumulate in pyplot's
                # global registry across reruns.
                plt.close(fig)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ matplotlib
6
+ seaborn
spotify_millsongdata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd19a8adf74791bfd99e1ccb8b1fc3bd2ed33399faeb86fa3677638a5623afd
3
+ size 74864162