{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "KHnddFeW5hwh" }, "outputs": [], "source": [ "import os\n", "import urllib.request\n", "import zipfile\n", "import json\n", "import pandas as pd\n", "import time\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from sklearn.model_selection import train_test_split\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l7pGG_d85lzH" }, "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "dL8TIlH55qSc" }, "outputs": [], "source": [ "import shutil\n", "import os\n", "\n", "def copy_file(src, dst):\n", " dst_dir = os.path.dirname(dst)\n", " if not os.path.exists(dst_dir):\n", " os.makedirs(dst_dir)\n", "\n", " shutil.copy2(src, dst)\n", "\n", "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LLy-YA775snY" }, "outputs": [], "source": [ "def unzip_archive(filepath, dir_path):\n", " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n", " zip_ref.extractall(dir_path)\n", "\n", "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YtO0seclE1Pb" }, "outputs": [], "source": [ "import shutil\n", "\n", "def make_dir(directory):\n", " if os.path.exists(directory):\n", " shutil.rmtree(directory)\n", " os.makedirs(directory)\n", " else:\n", " os.makedirs(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UeqDk3_65vTt" }, "outputs": [], "source": [ "directory = os.getcwd() + '/data/raw/data'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zMTup29b5wtO" }, "outputs": [], "source": [ "cols = [\n", " 'name',\n", " 'pid',\n", " 'num_followers',\n", " 'pos',\n", " 'artist_name',\n", " 'track_name',\n", " 'album_name'\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h6jQO9HT5zsG", "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mpd.slice.727000-727999.json\t100/1000\t10.0%" ] } ], "source": [ "directory = os.getcwd() + '/data/raw/playlists/data'\n", "df = pd.DataFrame()\n", "index = 0\n", "\n", "for filename in os.listdir(directory):\n", " if os.path.isfile(os.path.join(directory, filename)):\n", " if filename.find('.json') != -1 :\n", " index += 1\n", "\n", " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n", "\n", " full_path = os.path.join(directory, filename)\n", "\n", " with open(full_path, 'r') as file:\n", " json_data = json.load(file)\n", "\n", " temp = pd.DataFrame(json_data['playlists'])\n", " expanded_df = temp.explode('tracks').reset_index(drop=True)\n", "\n", " json_normalized = pd.json_normalize(expanded_df['tracks'])\n", " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n", " result = result[cols]\n", "\n", " df = pd.concat([df, result], axis=0, ignore_index=True)\n", "\n", " if index % 50 == 0:\n", " df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n", " del df\n", " df = pd.DataFrame()\n", " if index % 100 == 0:\n", " break" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "id": "PngL0QHq516u" }, "outputs": [], "source": [ "import pyarrow.parquet as pq\n", "\n", "def read_parquet_folder(folder_path):\n", " dataframes = []\n", " for file in os.listdir(folder_path):\n", " if file.endswith('.parquet'):\n", " file_path = os.path.join(folder_path, file)\n", " df = pd.read_parquet(file_path)\n", " dataframes.append(df)\n", "\n", " return pd.concat(dataframes, ignore_index=True)\n", "\n", "folder_path = os.getcwd() + '/../data/raw/data'\n", "df = read_parquet_folder(folder_path)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "id": "peZyue6t57Mz" }, "outputs": [], "source": [ "def create_ids(df, col, name):\n", " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n", " df[f'{name}_id'] = df[col].map(value_to_id)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "id": "p68WNyaf58rS" }, "outputs": [], "source": [ "df = create_ids(df, 'artist_name', 'artist')\n", "df = create_ids(df, 'pid', 'playlist')\n", "df = create_ids(df, 'album_name', 'album')" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "id": "aSBKxRFa5-O_" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n", "\n", "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n", "df['playlist_songs'] += 1" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n", "\n", "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n", "df['artist_album_id'] = df['artist_album'].map(value_to_id)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "id": "V1bhU5rW6BSY" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n", "\n", "encoder = LabelEncoder()\n", "encoder.fit(df['track_name'])\n", "df['track_id'] = encoder.transform(df['track_name'])" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "id": "l6sUWKYC6DCw" }, "outputs": [], "source": [ "df['song_percent'] = df['song_count'] / df['playlist_songs']" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "id": "XxC0WnlL6EWz" }, "outputs": [], "source": [ "import numpy as np\n", "\n", "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "temp = df" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "df = temp.iloc[:100000]" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "id": "5HLSc9z36Izn" }, "outputs": [], "source": [ "X = df.loc[:,['artist_id','album_id',]]\n", "y = df.loc[:,'song_percent',]\n", "\n", "# Split our data into training and test sets\n", "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_score, recall_score" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "id": "k47MaxR65Nq4" }, "outputs": [], "source": [ "class NaiveModel:\n", " def __init__(self, k=10):\n", " self.k = k\n", " self.top_k_items = None\n", "\n", " def fit(self, X, y):\n", " df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})\n", " avg_ratings = df.groupby('album_id')['song_percent'].mean()\n", " self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()\n", "\n", " def predict(self, X):\n", " return [self.top_k_items] * len(X)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "def precision_recall(actual,pred, k):\n", " actuals = set(actual)\n", " preds = set(pred[:k])\n", " true_positives = len(actuals & preds)\n", " precision = true_positives / k\n", " recall = true_positives / len(actuals)\n", " return precision, recall" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Osq-NpGu9V2k", "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b" }, "outputs": [], "source": [ "model = NaiveModel()\n", "model.fit(X_train, y_train)\n", "\n", "y_pred = model.predict(X_val)\n", "\n", "y_test_binary = (y_val >= 0.5).astype(int)\n", "y_test_items = X_val['album_id'][y_test_binary == 1].tolist()\n", "\n" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Precision: 1.0\n", "Recall: 0.0011200716845879738\n" ] } ], "source": [ "precisions = []\n", "recalls = []\n", "for i in range(len(X_val)):\n", " precision, recall = precision_recall(y_test_items, y_pred[i], k=10)\n", " precisions.append(precision)\n", " recalls.append(recall)\n", "\n", "precision = sum(precisions) / len(precisions)\n", "recall = sum(recalls) / len(recalls)\n", "\n", "print(f\"Precision: {precision}\")\n", "print(f\"Recall: {recall}\")" ] } ], "metadata": { "colab": { "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.15" } }, "nbformat": 4, "nbformat_minor": 0 }