{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "KHnddFeW5hwh" }, "outputs": [], "source": [ "import os\n", "import urllib.request\n", "import zipfile\n", "import json\n", "import pandas as pd\n", "import time\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from sklearn.model_selection import train_test_split\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l7pGG_d85lzH" }, "outputs": [], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "dL8TIlH55qSc" }, "outputs": [], "source": [ "import shutil\n", "import os\n", "\n", "def copy_file(src, dst):\n", " dst_dir = os.path.dirname(dst)\n", " if not os.path.exists(dst_dir):\n", " os.makedirs(dst_dir)\n", "\n", " shutil.copy2(src, dst)\n", "\n", "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LLy-YA775snY" }, "outputs": [], "source": [ "def unzip_archive(filepath, dir_path):\n", " with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n", " zip_ref.extractall(dir_path)\n", "\n", "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YtO0seclE1Pb" }, "outputs": [], "source": [ "import shutil\n", "\n", "def make_dir(directory):\n", " if os.path.exists(directory):\n", " shutil.rmtree(directory)\n", " os.makedirs(directory)\n", " else:\n", " os.makedirs(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UeqDk3_65vTt" }, "outputs": [], "source": [ "directory = os.getcwd() + '/data/raw/data'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zMTup29b5wtO" }, "outputs": [], "source": [ "cols = [\n", " 'name',\n", " 'pid',\n", " 'num_followers',\n", " 'pos',\n", " 'artist_name',\n", " 'track_name',\n", " 'album_name'\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h6jQO9HT5zsG", "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mpd.slice.727000-727999.json\t100/1000\t10.0%" ] } ], "source": [ "directory = os.getcwd() + '/data/raw/playlists/data'\n", "df = pd.DataFrame()\n", "index = 0\n", "# Loop through all files in the directory\n", "for filename in os.listdir(directory):\n", " # Check if the item is a file (not a subdirectory)\n", " if os.path.isfile(os.path.join(directory, filename)):\n", " if filename.find('.json') != -1 :\n", " index += 1\n", "\n", " # Print the filename or perform operations on the file\n", " print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n", "\n", " # If you need the full file path, you can use:\n", " full_path = os.path.join(directory, filename)\n", "\n", " with open(full_path, 'r') as file:\n", " json_data = json.load(file)\n", "\n", " temp = pd.DataFrame(json_data['playlists'])\n", " expanded_df = temp.explode('tracks').reset_index(drop=True)\n", "\n", " # Normalize the JSON data\n", " json_normalized = pd.json_normalize(expanded_df['tracks'])\n", "\n", " # Concatenate the original DataFrame with the normalized JSON data\n", " result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n", "\n", " result = result[cols]\n", "\n", " df = pd.concat([df, result], axis=0, ignore_index=True)\n", "\n", " if index % 50 == 0:\n", " df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n", " del df\n", " df = pd.DataFrame()\n", " if index % 100 == 0:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PngL0QHq516u" }, "outputs": [], "source": [ "import pyarrow.parquet as pq\n", "\n", "def read_parquet_folder(folder_path):\n", " dataframes = []\n", " for file in os.listdir(folder_path):\n", " if file.endswith('.parquet'):\n", " file_path = os.path.join(folder_path, file)\n", " df = pd.read_parquet(file_path)\n", " dataframes.append(df)\n", "\n", " return pd.concat(dataframes, ignore_index=True)\n", "\n", "folder_path = os.getcwd() + '/data/raw/data'\n", "df = read_parquet_folder(folder_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "hdLpjr2153b_" }, "outputs": [], "source": [ "directory = os.getcwd() + '/data/raw/mappings'\n", "make_dir(directory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "peZyue6t57Mz" }, "outputs": [], "source": [ "def create_ids(df, col, name):\n", " # Create a dictionary mapping unique values to IDs\n", " value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n", "\n", " # Create a new column with the IDs\n", " df[f'{name}_id'] = df[col].map(value_to_id)\n", " df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/{name}.csv')\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "p68WNyaf58rS" }, "outputs": [], "source": [ "df = create_ids(df, 'artist_name', 'artist')\n", "df = create_ids(df, 'pid', 'playlist')\n", "df = create_ids(df, 'album_name', 'album')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "aSBKxRFa5-O_" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n", "\n", "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n", "df['playlist_songs'] += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4WqHH-pn5_nL" }, "outputs": [], "source": [ "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n", "\n", "# Step 2: Create a dictionary mapping unique combined values to IDs\n", "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n", "\n", "# Step 3: Map these IDs back to the DataFrame\n", "df['artist_album_id'] = df['artist_album'].map(value_to_id)\n", "\n", "df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/raw/mappings/artist_album.csv')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V1bhU5rW6BSY" }, "outputs": [], "source": [ "df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n", "\n", "encoder = LabelEncoder()\n", "encoder.fit(df['track_name'])\n", "df['track_id'] = encoder.transform(df['track_name'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l6sUWKYC6DCw" }, "outputs": [], "source": [ "df['song_percent'] = df['song_count'] / df['playlist_songs']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XxC0WnlL6EWz" }, "outputs": [], "source": [ "import numpy as np\n", "\n", "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "kbxBcQiX6F2v", "outputId": "eb1fe0b1-83df-4a31-9110-5c904ad14af9" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "artists" }, "text/html": [ "\n", "
| \n", " | playlist_id | \n", "artist_id | \n", "album_id | \n", "
|---|---|---|---|
| 0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| 1 | \n", "0 | \n", "1 | \n", "1 | \n", "
| 2 | \n", "0 | \n", "2 | \n", "2 | \n", "
| 3 | \n", "0 | \n", "3 | \n", "3 | \n", "
| 4 | \n", "0 | \n", "4 | \n", "4 | \n", "