{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "KHnddFeW5hwh" }, "outputs": [], "source": [ "# All imports for the notebook, gathered in one cell.\n", "import os\n", "import urllib.request\n", "import zipfile\n", "import json\n", "import pandas as pd\n", "import time\n", "import torch\n", "import numpy as np\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from sklearn.model_selection import train_test_split\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l7pGG_d85lzH" }, "outputs": [], "source": [ "# Mount Google Drive so the raw dataset archive can be copied locally.\n", "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "dL8TIlH55qSc" }, "outputs": [], "source": [ "import shutil\n", "import os\n", "\n", "def copy_file(src, dst):\n", "    \"\"\"Copy src to dst, creating dst's parent directory if needed.\"\"\"\n", "    dst_dir = os.path.dirname(dst)\n", "    # exist_ok avoids the race between an existence check and makedirs.\n", "    os.makedirs(dst_dir, exist_ok=True)\n", "    shutil.copy2(src, dst)\n", "\n", "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LLy-YA775snY" }, "outputs": [], "source": [ "def unzip_archive(filepath, dir_path):\n", "    \"\"\"Extract the zip archive at filepath into dir_path.\"\"\"\n", "    with zipfile.ZipFile(filepath, 'r') as zip_ref:\n", "        zip_ref.extractall(dir_path)\n", "\n", "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YtO0seclE1Pb" }, "outputs": [], "source": [ "import shutil\n", "\n", "def make_dir(directory):\n", "    \"\"\"Create directory, wiping any previous contents first.\"\"\"\n", "    if os.path.exists(directory):\n", "        shutil.rmtree(directory)\n", "    os.makedirs(directory)" ] }, { "cell_type": "code",
"execution_count": null, "metadata": { "id": "UeqDk3_65vTt" }, "outputs": [], "source": [ "# Fresh scratch directory for the parquet checkpoints written below.\n", "directory = os.getcwd() + '/data/raw/data'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zMTup29b5wtO" }, "outputs": [], "source": [ "# Columns kept from each exploded playlist/track record.\n", "cols = [\n", "    'name',\n", "    'pid',\n", "    'num_followers',\n", "    'pos',\n", "    'artist_name',\n", "    'track_name',\n", "    'album_name'\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h6jQO9HT5zsG", "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mpd.slice.727000-727999.json\t100/1000\t10.0%" ] } ], "source": [ "directory = os.getcwd() + '/data/raw/playlists/data'\n", "frames = []  # per-slice frames; concatenated once per checkpoint\n", "index = 0\n", "# Loop through all MPD slice files in the directory\n", "for filename in os.listdir(directory):\n", "    full_path = os.path.join(directory, filename)\n", "    # Only process regular .json slice files (skip subdirectories etc.)\n", "    if os.path.isfile(full_path) and filename.find('.json') != -1:\n", "        index += 1\n", "\n", "        # Progress line: current slice file plus share of the 1000 slices\n", "        print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')\n", "\n", "        with open(full_path, 'r') as file:\n", "            json_data = json.load(file)\n", "\n", "        # Explode so there is one row per (playlist, track) pair\n", "        temp = pd.DataFrame(json_data['playlists'])\n", "        expanded_df = temp.explode('tracks').reset_index(drop=True)\n", "\n", "        # Normalize the per-track JSON dicts into columns\n", "        json_normalized = pd.json_normalize(expanded_df['tracks'])\n", "\n", "        # Concatenate the original DataFrame with the normalized JSON data\n", "        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n", "\n", "        frames.append(result[cols])\n", "\n", "        # Checkpoint every 50 files; one concat here avoids the quadratic\n", "        # cost of growing a DataFrame inside the loop. Naming by `index`\n", "        # (not `index % 1000`) keeps the 1000th checkpoint from\n", "        # overwriting an earlier one.\n", "        if index % 50 == 0:\n", "            pd.concat(frames, ignore_index=True).to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')\n", "            frames = []\n", "        if index % 100 == 0:\n", "            break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PngL0QHq516u" }, "outputs": [], "source": [ "import pyarrow.parquet as pq\n", "\n", "def read_parquet_folder(folder_path):\n", "    \"\"\"Read every .parquet file in folder_path into one DataFrame.\"\"\"\n", "    dataframes = []\n", "    # sorted() makes row order (and thus downstream IDs) deterministic;\n", "    # os.listdir order is arbitrary.\n", "    for file in sorted(os.listdir(folder_path)):\n", "        if file.endswith('.parquet'):\n", "            dataframes.append(pd.read_parquet(os.path.join(folder_path, file)))\n", "\n", "    return pd.concat(dataframes, ignore_index=True)\n", "\n", "folder_path = os.getcwd() + '/data/raw/data'\n", "df = read_parquet_folder(folder_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hdLpjr2153b_" }, "outputs": [], "source": [ "# Fresh directory for the id -> value mapping CSVs.\n", "directory = os.getcwd() + '/data/raw/mappings'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "peZyue6t57Mz" }, "outputs": [], "source": [ "def create_ids(df, col, name):\n", "    \"\"\"Add an integer '<name>_id' column for df[col] and persist the mapping.\n", "\n", "    IDs are assigned by first appearance of each unique value; the\n", "    (id, value) pairs are written to data/raw/mappings/<name>.csv.\n", "    \"\"\"\n", "    # Create a dictionary mapping unique values to IDs\n", "    value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n", "\n", "    # Create a new column with the IDs\n", "    df[f'{name}_id'] = df[col].map(value_to_id)\n", "    df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n", "\n", "    return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "p68WNyaf58rS" }, "outputs": [], "source": [ "df = create_ids(df, 'artist_name', 'artist')\n", "df = create_ids(df, 'pid', 'playlist')\n", "df = create_ids(df, 'album_name', 'album')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "aSBKxRFa5-O_" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n", "\n", "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
"df['playlist_songs'] += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4WqHH-pn5_nL" }, "outputs": [], "source": [ "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n", "\n", "# Step 2: Create a dictionary mapping unique combined values to IDs\n", "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n", "\n", "# Step 3: Map these IDs back to the DataFrame\n", "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n", "\n", "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V1bhU5rW6BSY" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n", "\n", "encoder = LabelEncoder()\n", "encoder.fit(df['track_name'])\n", "df['track_id'] = encoder.transform(df['track_name'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l6sUWKYC6DCw" }, "outputs": [], "source": [ "df['song_percent'] = df['song_count'] / df['playlist_songs']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XxC0WnlL6EWz" }, "outputs": [], "source": [ "import numpy as np\n", "\n", "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "kbxBcQiX6F2v", "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "artists" }, "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
playlist_idartist_idalbum_id
0000
1011
2022
3033
4044
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "text/plain": [ " playlist_id artist_id album_id\n", "0 0 0 0\n", "1 0 1 1\n", "2 0 2 2\n", "3 0 3 3\n", "4 0 4 4" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "artists = df.loc[:,['playlist_id','artist_id','album_id']].drop_duplicates()\n", "artists.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5HLSc9z36Izn" }, "outputs": [], "source": [ "X = artists.loc[:,['artist_id','album_id',]]\n", "y = artists.loc[:,'playlist_id',]\n", "\n", "# Split our data into training and test sets\n", "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "k47MaxR65Nq4" }, "outputs": [], "source": [ "from sklearn.cluster import DBSCAN\n", "db_model = DBSCAN(eps=0.2,min_samples=5)\n", "labels_db = db_model.fit_predict(X)\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Osq-NpGu9V2k", "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Precision: 1.589262536579764e-05\n", "Recall: 9.606273770069471e-06\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ], "source": [ "from sklearn.metrics import precision_score, recall_score\n", "\n", "# Exclude DBSCAN noise points (label -1) before scoring.\n", "y_no_noise = y[labels_db != -1]\n", "labels_db_no_noise = labels_db[labels_db != -1]\n", "\n", "# zero_division=0 makes the previously implicit behavior explicit (classes\n", "# with no predicted/true samples contribute 0) and resolves the\n", "# UndefinedMetricWarning shown in the recorded output above.\n", "precision = precision_score(y_no_noise, labels_db_no_noise, average='weighted', zero_division=0)\n", "recall = recall_score(y_no_noise, labels_db_no_noise, average='weighted', zero_division=0)\n", "\n", "print(f'Precision: {precision}')\n", "print(f'Recall: {recall}')" ] } ], "metadata": { "colab": { "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }