Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from gradio import components | |
| import numpy as np | |
| import pandas as pd | |
| import pyarrow | |
| import os | |
| import requests | |
# Download the song-lyrics dataset (Parquet; the file name suggests it carries
# precomputed embeddings) into the current working directory, then load it.
url = 'https://huggingface.co/datasets/sheacon/song_lyrics/resolve/main/v2ga_w_embeddings_half.parquet'
filename = os.path.join(os.getcwd(), url.split('/')[-1])

# Stream the body so the whole file is never held in memory at once. Using the
# response as a context manager releases the connection when the copy is done,
# and the timeout keeps a stalled connection from hanging start-up forever.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail fast on an HTTP error instead of saving an error page as Parquet.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty keep-alive chunks
                file.write(chunk)
print(f"File '{filename}' download complete.")

# Read back exactly the path that was just written.
df = pd.read_parquet(filename)
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    The result is the dot product divided by the product of the two
    Euclidean norms. If either vector has zero norm the cosine is
    undefined and ``np.nan`` is returned instead.
    """
    numerator = np.dot(v1, v2)
    length_1 = np.linalg.norm(v1)
    length_2 = np.linalg.norm(v2)
    if length_1 != 0.0 and length_2 != 0.0:
        return numerator / (length_1 * length_2)
    # At least one zero-length vector: no direction to compare.
    return np.nan
def relevance_scores(query_embed, df, embeddings):
    """Score every row of ``df`` against ``query_embed`` by cosine similarity.

    Args:
        query_embed: Embedding vector to compare against.
        df: DataFrame holding one embedding per row.
        embeddings: Name of the column in ``df`` that holds the embeddings.

    Returns:
        A Series of similarities sorted from highest to lowest. Its labels
        are the positions of the rows in ``df``. The top entry is overwritten
        with 0 so that a later ``idxmax`` skips it (presumably the query song
        matching itself).
    """
    similarities = pd.Series(
        [cosine_similarity(query_embed, candidate) for candidate in df[embeddings]]
    )
    ranked = similarities.sort_values(ascending=False)
    # Knock out the best match so the runner-up becomes the maximum.
    ranked.iloc[0] = 0
    return ranked
def _song_summary(row):
    """Return the title, artist and single-line lyrics of one song row as a dict."""
    return {
        'title': row['title'],
        'artist': row['artist'],
        # Flatten line breaks so the lyrics read as one line in the JSON output.
        'lyrics': row['lyrics'].replace('\n', '. '),
    }


def semantic_search(artist, title):
    """Find the song most similar to the chosen one under each embedding model.

    The chosen song is the first row of the module-level ``df`` whose
    ``artist`` and ``title`` both match exactly. For each of the GloVe,
    MiniLM, RoBERTa and GPT embedding columns, the best-scoring row from
    ``relevance_scores`` is reported.

    Args:
        artist: Artist name to match against ``df['artist']``.
        title: Song title to match against ``df['title']``.

    Returns:
        A dict with keys ``'chosen_song'``, ``'glove'``, ``'minilm'``,
        ``'roberta'`` and ``'gpt'``; each value is a dict with ``'title'``,
        ``'artist'`` and ``'lyrics'`` (newlines replaced by ``'. '``).

    Raises:
        gr.Error: If no row matches the artist/title pair. The two inputs
            are chosen independently, so mismatched pairs are expected.
    """
    matches = df[(df['artist'] == artist) & (df['title'] == title)]
    if matches.empty:
        # Surface a readable message in the UI instead of an opaque IndexError.
        raise gr.Error(f"No song titled '{title}' by '{artist}' was found.")
    query_row = matches.iloc[0]

    results = {'chosen_song': _song_summary(query_row)}
    for model in ('glove', 'minilm', 'roberta', 'gpt'):
        column = f"embedding_{model}"
        scores = relevance_scores(query_row[column], df, column)
        # Score labels are row positions in df, so idxmax feeds iloc directly.
        results[model] = _song_summary(df.iloc[scores.idxmax()])
    return results
| from gradio.components import Dropdown | |
# Dropdown choices: every distinct artist and title in the dataset, sorted.
artists = sorted(df['artist'].unique())
titles = sorted(df['title'].unique())
artist_dropdown = Dropdown(artists, label="Artist")
title_dropdown = Dropdown(titles, label="Title")

# Draw 100 random rows and offer them as ready-made [artist, title] examples.
df_sample = df.sample(100)
sample_artists = df_sample['artist'].tolist()
sample_titles = df_sample['title'].tolist()
artist_title_sample = [list(pair) for pair in zip(sample_artists, sample_titles)]

# The search result is a nested dict, so render it as JSON.
output_interface = gr.components.JSON(label="Similar Songs")

iface = gr.Interface(
    title="Similar Song Finder",
    description="Find four similar songs to the selected song based on different embeddings (GloVe, MiniLM, RoBERTa, GPT).",
    fn=semantic_search,
    inputs=[artist_dropdown, title_dropdown],
    outputs=output_interface,
    examples=artist_title_sample,
)

iface.launch()