import gradio as gr import pandas as pd import pickle from sklearn.metrics.pairwise import cosine_similarity import heapq # Load data and model df = pd.read_csv('./DATA/spotify_millsongdata.csv') # Load saved embeddings with open("./DATA/lyrics_embeddings.pkl", "rb") as f: lyrics_embeddings = pickle.load(f) # List of artists and songs artists = df['artist'].unique() song_titles = df['song'] # Recommendation logic def recommend_songs(song_index, top_n=5, batch_size=100): top_sim_scores = [] num_batches = len(df) // batch_size + 1 for i in range(num_batches): start_idx = i * batch_size end_idx = min((i + 1) * batch_size, len(df)) # Compute cosine similarity for the current batch cosine_sim_batch = cosine_similarity( lyrics_embeddings[start_idx:end_idx], [lyrics_embeddings[song_index]] ) # Select the top N most similar songs for j, sim_score in enumerate(cosine_sim_batch): global_idx = start_idx + j heapq.heappush(top_sim_scores, (sim_score[0], global_idx)) if len(top_sim_scores) > top_n + 1: heapq.heappop(top_sim_scores) # Exclude the selected song itself and return the most similar songs with their similarity scores top_sim_scores = sorted(top_sim_scores, key=lambda x: x[0], reverse=True)[1:top_n+1] recommended_songs = [(song_titles[i[1]], df['link'][i[1]], round(i[0], 2)) for i in top_sim_scores] return recommended_songs # Interface logic function def get_songs_by_artist(artist_name): filtered_songs = df[df['artist'] == artist_name]['song'].tolist() return gr.update(choices=filtered_songs, value=filtered_songs[0] if filtered_songs else None) def gradio_recommend(song_title): try: # Find the index of the selected song song_index = song_titles[song_titles == song_title].index[0] # Get recommended songs recommendations = recommend_songs(song_index) # Format the output, making song links clickable result = "
Get the most relevant song recommendations based on lyrics similarity