|
|
import pickle |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from sklearn.neighbors import NearestNeighbors |
|
|
|
|
|
# Path to the pre-built recommender bundle (features + labels) on disk.
BUNDLE_PATH = "spotify_recommender.pkl"

# Load the bundle.
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — only load bundles produced by this project.
with open(BUNDLE_PATH, "rb") as f:
    bundle = pickle.load(f)

# Feature matrix (one row per track) and the parallel sequence of
# human-readable track labels ('track_name – artist_name').
features_all: np.ndarray = bundle["features"]
track_labels_all = bundle["track_labels"]

# Cap the working set so the app stays lightweight on any machine.
MAX_SONGS = 100

n_total = features_all.shape[0]
n_used = min(MAX_SONGS, n_total)

# Keep only the first n_used tracks; features and track_labels stay aligned
# row-for-row, so index i refers to the same track in both.
features = features_all[:n_used]
track_labels = track_labels_all[:n_used]

# Brute-force cosine-distance nearest-neighbor index over the subset.
nn_model = NearestNeighbors(metric="cosine", algorithm="brute")
nn_model.fit(features)

# Reverse lookup: label string -> row index into `features`.
label_to_index = {label: i for i, label in enumerate(track_labels)}
|
|
|
|
|
def _split_label(label: str): |
|
|
""" |
|
|
label format: 'track_name – artist_name' |
|
|
Uses an en dash (U+2013). Falls back gracefully if not present. |
|
|
""" |
|
|
if " – " in label: |
|
|
track_name, artist_name = label.split(" – ", 1) |
|
|
else: |
|
|
track_name, artist_name = label, "" |
|
|
return track_name, artist_name |
|
|
|
|
|
|
|
|
def recommend_tracks_ui(query_label: str, k: int):
    """Gradio callback: return a DataFrame of the k most similar tracks.

    Looks up the selected track, queries the fitted nearest-neighbor
    model, drops the query track itself, and reports each match as
    track_name / artist_name / cosine similarity.
    """
    if query_label not in label_to_index:
        return pd.DataFrame(
            {"error": ["Track not found. Please select from the dropdown."]}
        )

    query_idx = label_to_index[query_label]
    top_k = int(k)

    # Ask for one extra neighbor: the query track itself is normally the
    # closest match and gets filtered out below.
    dists, neighbor_ids = nn_model.kneighbors(
        features[query_idx:query_idx + 1],
        n_neighbors=min(len(features), top_k + 1),
    )
    dists = dists[0]
    neighbor_ids = neighbor_ids[0]

    # Exclude the query track, keep at most top_k results.
    keep = neighbor_ids != query_idx
    neighbor_ids = neighbor_ids[keep][:top_k]
    sims = 1.0 - dists[keep][:top_k]

    def _row(track_idx, sim):
        # One result row: split the label back into its two display columns.
        name, artist = _split_label(track_labels[track_idx])
        return {
            "track_name": name,
            "artist_name": artist,
            "similarity": float(sim),
        }

    rows = [_row(i, s) for i, s in zip(neighbor_ids, sims)]

    if not rows:
        return pd.DataFrame({"info": ["No matches, either no matches or you have an unique taste my friend"]})

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
def evaluate_mean_similarity_ui(k: int, n_samples: int):
    """Gradio callback: estimate recommendation quality on the subset.

    Samples n_samples tracks (fixed seed, so the result is reproducible),
    collects each sample's top-k neighbor cosine similarities, and reports
    the mean ± standard deviation of the per-track mean similarity.
    """
    k = int(k)
    n_samples = int(n_samples)

    total = features.shape[0]
    if total == 0:
        return "No tracks found"

    # Never sample more tracks than exist in the subset.
    n_samples = min(n_samples, total)

    # Fixed seed keeps the evaluation deterministic across runs.
    rng = np.random.default_rng(42)
    chosen = rng.choice(total, size=n_samples, replace=False)

    per_track_means = []
    for query_idx in chosen:
        # k+1 neighbors so the query track itself can be discarded.
        dists, neighbor_ids = nn_model.kneighbors(
            features[query_idx:query_idx + 1],
            n_neighbors=min(total, k + 1),
        )
        dists = dists[0]
        neighbor_ids = neighbor_ids[0]

        # Drop the query track, keep at most k neighbor distances.
        dists = dists[neighbor_ids != query_idx][:k]
        if len(dists) == 0:
            continue

        # Cosine similarity = 1 - cosine distance.
        per_track_means.append((1.0 - dists).mean())

    if not per_track_means:
        return "evaluation failed, try again my friend"

    stats = np.array(per_track_means)
    mean_sim = float(stats.mean())
    std_sim = float(stats.std())

    return (
        f"Mean top-{k} cosine similarity over {len(stats)} random tracks "
        f"(subset of {n_used} tracks): {mean_sim:.4f} ± {std_sim:.4f}"
    )
|
|
|
|
|
# ---- Gradio UI -------------------------------------------------------------
# Component creation order inside the Blocks context defines the page layout.
with gr.Blocks(title="Spotify Content-Based Recommender (Subset)") as demo:
    gr.Markdown("# Music Recommender - now that what i call music")
    gr.Markdown(
        f"It only uses **{n_used}** tracks from the full dataset "
        "to make sure all PC can handle it"
    )

    # Tab 1: interactive recommendations for a single selected track.
    with gr.Tab("Recommender"):
        # Dropdown is restricted to the loaded subset, so the callback's
        # label lookup can only miss if nothing is selected.
        song_input = gr.Dropdown(
            choices=track_labels,
            label="Choose a track (subset)",
        )
        k_input = gr.Slider(
            minimum=1,
            maximum=10,
            value=5,
            step=1,
            label="Number of recommendations",
        )
        recommend_button = gr.Button("Find recommends - find more music you grove")
        rec_output = gr.Dataframe(
            label="Recommended Tracks - that match your grove",
            interactive=False
        )

        # Wire the button to the recommendation callback.
        recommend_button.click(
            fn=recommend_tracks_ui,
            inputs=[song_input, k_input],
            outputs=rec_output,
        )

    # Tab 2: offline evaluation of the recommender on random samples.
    with gr.Tab("Evaluation"):
        gr.Markdown(
            "The recommender evaluates this subset using **mean cosine similarity** "
            "between query tracks and their top-k neighbors."
        )
        k_eval = gr.Slider(1, 10, value=5, step=1, label="k (top-k neighbors)")
        n_eval = gr.Slider(10, 100, value=50, step=10, label="Number of random tracks to sample")
        eval_button = gr.Button("Run evaluation")
        eval_output = gr.Textbox(label="Result")

        # Wire the button to the evaluation callback.
        eval_button.click(
            fn=evaluate_mean_similarity_ui,
            inputs=[k_eval, n_eval],
            outputs=eval_output,
        )

# Start the local Gradio server (blocks until the process is stopped).
demo.launch()
|
|
|