# aikanava's picture
# update app.py
# 988844b verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Set the page title for the app.
# NOTE(review): the emoji prefix in the title looks mis-encoded (mojibake) —
# confirm the file is saved and served as UTF-8.
st.set_page_config(
    page_title="🎡 Lyrics-Based Song Recommendations",
)
# Load dataset
@st.cache_data
def load_data(csv_path: str = "spotify_millsongdata.csv") -> pd.DataFrame:
    """Read the lyrics CSV and drop the rows that have no lyrics.

    Args:
        csv_path: Location of the CSV file. Defaults to the file name the
            app previously hard-coded, so ``load_data()`` behaves as before.

    Returns:
        The CSV contents as a DataFrame, without the rows whose ``text``
        column is missing.
    """
    songs = pd.read_csv(csv_path)
    # Rows without lyrics cannot be vectorized, so they are removed up front.
    return songs.dropna(subset=["text"])
df = load_data()


@st.cache_resource
def _fit_lyrics_index(_lyrics, max_features=5000, n_neighbors=5):
    """Fit the TF-IDF vectorizer and the nearest-neighbour index once.

    Streamlit re-executes the script on every widget interaction; caching the
    fitted objects avoids re-vectorizing the whole dataset on each rerun.

    Args:
        _lyrics: Iterable of lyric strings, one per song. The leading
            underscore tells Streamlit not to hash this argument when
            building the cache key.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        n_neighbors: Number of neighbours the index returns per query.

    Returns:
        A ``(vectorizer, lyrics_matrix, knn)`` tuple: the fitted
        ``TfidfVectorizer``, the TF-IDF matrix of the lyrics, and the fitted
        ``NearestNeighbors`` model using cosine distance.
    """
    # Convert lyrics into numerical features using TF-IDF
    tfidf = TfidfVectorizer(stop_words="english", max_features=max_features)
    matrix = tfidf.fit_transform(_lyrics)
    # Never request more neighbours than there are songs, otherwise
    # kneighbors() raises on very small datasets.
    neighbours = NearestNeighbors(
        n_neighbors=min(n_neighbors, matrix.shape[0]),
        metric="cosine",
    )
    neighbours.fit(matrix)
    return tfidf, matrix, neighbours


# Same module-level names as before, now backed by the cached fit.
vectorizer, lyrics_matrix, knn = _fit_lyrics_index(df["text"])
# Page header: the app title followed by a short description of the approach.
st.title("🎢 Lyrics-Based Song Recommendation System")
intro_text = (
    "Discover songs that match your favorite lyrics! This app uses **TF-IDF** and **KNN** to find songs with similar lyrical content."
)
st.markdown(intro_text)
# Tabs for better UI
# NOTE(review): the emoji prefixes in the labels below look mis-encoded
# (mojibake) — confirm the file is saved as UTF-8 before changing them.
tab1, tab2 = st.tabs(["πŸ“Š Dataset Overview", "🎀 Lyrics-Based Recommendation"])

with tab1:
    # Dataset sample: a random selection of rows, redrawn on each rerun.
    # The size is clamped so datasets with fewer than 20 rows do not raise.
    sample_df = df.sample(min(20, len(df)))
    st.dataframe(sample_df[["song", "artist", "text"]])

    # Word count per song, computed once and shared by the statistics and
    # the histogram below.
    lyrics_length = df["text"].str.split().str.len()

    # Expander for Dataset Statistics
    with st.expander("πŸ“Š Dataset Statistics"):
        total_songs = df.shape[0]
        unique_artists = df["artist"].nunique()
        avg_lyrics_length = lyrics_length.mean()
        st.write(f"πŸ“Š **Total Songs**: {total_songs}")
        st.write(f"🎀 **Unique Artists**: {unique_artists}")
        st.write(f"πŸ“– **Average Lyrics Length**: {avg_lyrics_length:.2f} words")

    # Expander for Lyrics Length Distribution
    with st.expander("πŸ“– Lyrics Length Distribution (Word Count per Song)"):
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(lyrics_length, kde=True, ax=ax, color="skyblue")
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Number of Songs")
        st.pyplot(fig)
        # Release the figure; otherwise pyplot keeps every figure created on
        # every rerun alive in its global registry.
        plt.close(fig)

    # Most Frequent Artists
    with st.expander("🎀 Most Frequent Artists in the Dataset"):
        artist_counts = df["artist"].value_counts().head(10)
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(y=artist_counts.index, x=artist_counts.values, ax=ax, palette="mako")
        ax.set_xlabel("Number of Songs")
        ax.set_ylabel("Artist")
        st.pyplot(fig)
        plt.close(fig)
with tab2:
    # Recommendation tab: vectorize the typed snippet with the fitted TF-IDF
    # model and list its nearest songs by cosine similarity.
    st.subheader("Enter Lyrics Snippet")
    user_lyrics = st.text_area("Type lyrics snippet:", "")
    clicked = st.button("🎢 Get Recommendations")

    if clicked and not user_lyrics.strip():
        # Give feedback instead of silently ignoring the click.
        st.warning("Please enter a lyrics snippet first.")
    elif clicked:
        # Convert user input into the same TF-IDF space
        user_vector = vectorizer.transform([user_lyrics])

        if user_vector.nnz == 0:
            # None of the words are in the fitted vocabulary (or all are stop
            # words): an all-zero vector has no meaningful cosine neighbours.
            st.warning(
                "None of the words in this snippet are known to the model. "
                "Try a longer or different snippet."
            )
        else:
            # Find similar songs
            distances, indices = knn.kneighbors(user_vector)
            st.subheader("Recommended Songs:")

            recommendations = []
            for distance, idx in zip(distances[0], indices[0]):
                # Row positions in the TF-IDF matrix match positions in df,
                # hence positional lookup with iloc.
                match = df.iloc[idx]
                recommended_song = match["song"]
                recommended_artist = match["artist"]
                similarity_score = 1 - distance  # Convert cosine distance to similarity
                recommendations.append((recommended_song, recommended_artist, similarity_score))
                st.write(f"🎢 **{recommended_song}** - {recommended_artist} (Similarity: `{similarity_score:.2f}`)")