Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.neighbors import NearestNeighbors | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from collections import Counter | |
# Page-level configuration: sets the page title for the app.
st.set_page_config(
    page_title="π΅ Lyrics-Based Song Recommendations",
)
# Load dataset
def load_data(path: str = "spotify_millsongdata.csv") -> pd.DataFrame:
    """Read the song dataset from a CSV file and drop rows without lyrics.

    Args:
        path: Location of the CSV file. Defaults to the file name the app
            has always used, so existing ``load_data()`` calls are unchanged.

    Returns:
        The CSV contents as a DataFrame with every row whose ``text`` column
        is missing removed. The original row labels are kept (the index is
        not reset).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the file has no ``text`` column.
    """
    songs = pd.read_csv(path)
    # Rows with missing lyrics cannot be vectorized, so they are removed here.
    return songs.dropna(subset=["text"])
# Streamlit re-executes this script from the top on every widget interaction.
# Without caching, each button click would re-read the CSV, re-fit TF-IDF and
# re-fit the KNN index. The helpers below make that work happen once.
# NOTE: cached values persist until the cache is cleared or the app restarts,
# so changes to the CSV on disk are not picked up automatically.
@st.cache_data
def _load_cached_dataset():
    """Return the DataFrame produced by ``load_data()``, cached across reruns."""
    return load_data()


@st.cache_resource
def _fit_recommender(_lyrics):
    """Fit the TF-IDF vectorizer and the cosine KNN index on the lyrics.

    Args:
        _lyrics: Iterable of lyric strings, one per song. The leading
            underscore tells Streamlit not to hash this argument when
            building the cache key, so the fitted objects are reused
            for every call.

    Returns:
        A ``(vectorizer, lyrics_matrix, knn)`` tuple: the fitted
        ``TfidfVectorizer``, the TF-IDF matrix of the lyrics, and the
        ``NearestNeighbors`` model fitted on that matrix.
    """
    # Convert lyrics into numerical features using TF-IDF
    tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
    matrix = tfidf.fit_transform(_lyrics)
    # Train KNN model (5 neighbours, cosine distance)
    model = NearestNeighbors(n_neighbors=5, metric="cosine")
    model.fit(matrix)
    return tfidf, matrix, model


df = _load_cached_dataset()
vectorizer, lyrics_matrix, knn = _fit_recommender(df["text"])
# Streamlit UI: page heading followed by a one-paragraph description
st.title("πΆ Lyrics-Based Song Recommendation System")
intro_text = (
    "Discover songs that match your favorite lyrics! "
    "This app uses **TF-IDF** and **KNN** to find songs with similar lyrical content."
)
st.markdown(intro_text)

# Two tabs: one for exploring the dataset, one for the recommender
tab_labels = ["π Dataset Overview", "π€ Lyrics-Based Recommendation"]
tab1, tab2 = st.tabs(tab_labels)
with tab1:
    # Dataset sample: up to 20 random rows, re-drawn on every rerun.
    # Capping at len(df) avoids the ValueError that DataFrame.sample raises
    # when asked for more rows than the frame contains.
    sample_df = df.sample(min(20, len(df)))
    st.dataframe(sample_df[["song", "artist", "text"]])

    # Word count per song, computed once and shared by the statistics
    # and the histogram below.
    lyrics_length = df["text"].apply(lambda x: len(x.split()))

    # Expander for Dataset Statistics
    with st.expander("π Dataset Statistics"):
        total_songs = df.shape[0]
        unique_artists = df["artist"].nunique()
        avg_lyrics_length = lyrics_length.mean()
        st.write(f"π **Total Songs**: {total_songs}")
        st.write(f"π€ **Unique Artists**: {unique_artists}")
        st.write(f"π **Average Lyrics Length**: {avg_lyrics_length:.2f} words")

    # Expander for Lyrics Length Distribution
    with st.expander("π Lyrics Length Distribution (Word Count per Song)"):
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(lyrics_length, kde=True, ax=ax, color="skyblue")
        ax.set_xlabel("Word Count")
        ax.set_ylabel("Number of Songs")
        st.pyplot(fig)
        # pyplot keeps every figure it creates alive until it is closed;
        # close it so figures do not pile up across reruns.
        plt.close(fig)

    # Most Frequent Artists (top 10 by number of songs)
    with st.expander("π€ Most Frequent Artists in the Dataset"):
        artist_counts = df["artist"].value_counts().head(10)
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.barplot(y=artist_counts.index, x=artist_counts.values, ax=ax, palette="mako")
        ax.set_xlabel("Number of Songs")
        ax.set_ylabel("Artist")
        st.pyplot(fig)
        plt.close(fig)
with tab2:
    st.subheader("Enter Lyrics Snippet")
    user_lyrics = st.text_area("Type lyrics snippet:", "")

    if st.button("πΆ Get Recommendations"):
        if not user_lyrics.strip():
            # Previously a click with an empty box did nothing at all.
            st.warning("Please enter a lyrics snippet first.")
        else:
            # Convert user input into the same TF-IDF space
            user_vector = vectorizer.transform([user_lyrics])

            if user_vector.nnz == 0:
                # No word of the snippet is in the fitted vocabulary (or all
                # were stop words): the query vector is all zeros, so any
                # neighbours returned would be meaningless.
                st.warning(
                    "None of the words in your snippet are in the model's "
                    "vocabulary. Try a longer or different snippet."
                )
            else:
                # Find similar songs
                distances, indices = knn.kneighbors(user_vector)
                st.subheader("Recommended Songs:")

                # kneighbors returns one row per query; there is a single query.
                for distance, idx in zip(distances[0], indices[0]):
                    # idx is a row position in lyrics_matrix, which was built
                    # from df["text"], so positional lookup is the right one.
                    match = df.iloc[idx]
                    recommended_song = match["song"]
                    recommended_artist = match["artist"]
                    similarity_score = 1 - distance  # Convert cosine distance to similarity
                    st.write(f"πΆ **{recommended_song}** - {recommended_artist} (Similarity: `{similarity_score:.2f}`)")