{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "KHnddFeW5hwh"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import urllib.request\n",
        "import zipfile\n",
        "import json\n",
        "import pandas as pd\n",
        "import time\n",
        "import torch\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import torch.nn as nn\n",
        "import torch.nn.functional as F\n",
        "import torch.optim as optim\n",
        "from torch.utils.data import DataLoader, TensorDataset\n",
        "from sklearn.model_selection import train_test_split\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn.preprocessing import LabelEncoder"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l7pGG_d85lzH"
      },
      "outputs": [],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "dL8TIlH55qSc"
      },
      "outputs": [],
      "source": [
        "import shutil\n",
        "import os\n",
        "\n",
        "def copy_file(src, dst):\n",
        "  dst_dir = os.path.dirname(dst)\n",
        "  if not os.path.exists(dst_dir):\n",
        "    os.makedirs(dst_dir)\n",
        "\n",
        "  shutil.copy2(src, dst)\n",
        "\n",
        "copy_file('/content/drive/MyDrive/rec_data/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LLy-YA775snY"
      },
      "outputs": [],
      "source": [
        "def unzip_archive(filepath, dir_path):\n",
        "  with zipfile.ZipFile(f\"{filepath}\", 'r') as zip_ref:\n",
        "    zip_ref.extractall(dir_path)\n",
        "\n",
        "unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YtO0seclE1Pb"
      },
      "outputs": [],
      "source": [
        "import shutil\n",
        "\n",
        "def make_dir(directory):\n",
        "    if os.path.exists(directory):\n",
        "        shutil.rmtree(directory)\n",
        "        os.makedirs(directory)\n",
        "    else:\n",
        "        os.makedirs(directory)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UeqDk3_65vTt"
      },
      "outputs": [],
      "source": [
        "directory = os.getcwd() + '/data/raw/data'\n",
        "make_dir(directory)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zMTup29b5wtO"
      },
      "outputs": [],
      "source": [
        "cols = [\n",
        "    'name',\n",
        "    'pid',\n",
        "    'num_followers',\n",
        "    'pos',\n",
        "    'artist_name',\n",
        "    'track_name',\n",
        "    'album_name'\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "h6jQO9HT5zsG",
        "outputId": "ec229c95-c29b-4622-bccf-0fc0bb69f9ba"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "mpd.slice.727000-727999.json\t100/1000\t10.0%"
          ]
        }
      ],
      "source": [
        "directory = os.getcwd() + '/data/raw/playlists/data'\n",
        "df = pd.DataFrame()\n",
        "index = 0\n",
        "\n",
        "for filename in os.listdir(directory):\n",
        "    if os.path.isfile(os.path.join(directory, filename)):\n",
        "        if filename.find('.json') != -1 :\n",
        "          index += 1\n",
        "\n",
        "          print(f'\\r{filename}\\t{index}/1000\\t{((index/1000)*100):.1f}%', end='')\n",
        "\n",
        "          full_path = os.path.join(directory, filename)\n",
        "\n",
        "          with open(full_path, 'r') as file:\n",
        "              json_data = json.load(file)\n",
        "\n",
        "          temp = pd.DataFrame(json_data['playlists'])\n",
        "          expanded_df = temp.explode('tracks').reset_index(drop=True)\n",
        "\n",
        "          json_normalized = pd.json_normalize(expanded_df['tracks'])\n",
        "          result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)\n",
        "          result = result[cols]\n",
        "\n",
        "          df = pd.concat([df, result], axis=0, ignore_index=True)\n",
        "\n",
        "        if index % 50 == 0:\n",
        "            df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')\n",
        "            del df\n",
        "            df = pd.DataFrame()\n",
        "            if index % 100 == 0:\n",
        "                break"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 44,
      "metadata": {
        "id": "PngL0QHq516u"
      },
      "outputs": [],
      "source": [
        "import pyarrow.parquet as pq\n",
        "\n",
        "def read_parquet_folder(folder_path):\n",
        "    dataframes = []\n",
        "    for file in os.listdir(folder_path):\n",
        "        if file.endswith('.parquet'):\n",
        "            file_path = os.path.join(folder_path, file)\n",
        "            df = pd.read_parquet(file_path)\n",
        "            dataframes.append(df)\n",
        "\n",
        "    return pd.concat(dataframes, ignore_index=True)\n",
        "\n",
        "folder_path = os.getcwd() + '/../data/raw/data'\n",
        "df = read_parquet_folder(folder_path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 45,
      "metadata": {
        "id": "peZyue6t57Mz"
      },
      "outputs": [],
      "source": [
        "def create_ids(df, col, name):\n",
        "    value_to_id = {val: i for i, val in enumerate(df[col].unique())}\n",
        "    df[f'{name}_id'] = df[col].map(value_to_id)\n",
        "\n",
        "    return df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 46,
      "metadata": {
        "id": "p68WNyaf58rS"
      },
      "outputs": [],
      "source": [
        "df = create_ids(df, 'artist_name', 'artist')\n",
        "df = create_ids(df, 'pid', 'playlist')\n",
        "df = create_ids(df, 'album_name', 'album')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 47,
      "metadata": {
        "id": "aSBKxRFa5-O_"
      },
      "outputs": [],
      "source": [
        "df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')\n",
        "\n",
        "df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')\n",
        "df['playlist_songs'] += 1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 48,
      "metadata": {},
      "outputs": [],
      "source": [
        "df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)\n",
        "\n",
        "value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}\n",
        "df['artist_album_id'] = df['artist_album'].map(value_to_id)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 49,
      "metadata": {
        "id": "V1bhU5rW6BSY"
      },
      "outputs": [],
      "source": [
        "df['song_count'] =  df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')\n",
        "\n",
        "encoder = LabelEncoder()\n",
        "encoder.fit(df['track_name'])\n",
        "df['track_id'] = encoder.transform(df['track_name'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 50,
      "metadata": {
        "id": "l6sUWKYC6DCw"
      },
      "outputs": [],
      "source": [
        "df['song_percent'] = df['song_count'] / df['playlist_songs']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 51,
      "metadata": {
        "id": "XxC0WnlL6EWz"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "\n",
        "df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 57,
      "metadata": {},
      "outputs": [],
      "source": [
        "temp = df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 78,
      "metadata": {},
      "outputs": [],
      "source": [
        "df = temp.iloc[:100000]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 79,
      "metadata": {
        "id": "5HLSc9z36Izn"
      },
      "outputs": [],
      "source": [
        "X = df.loc[:,['artist_id','album_id',]]\n",
        "y = df.loc[:,'song_percent',]\n",
        "\n",
        "# Split our data into training and test sets\n",
        "X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 80,
      "metadata": {},
      "outputs": [],
      "source": [
        "from sklearn.metrics import precision_score, recall_score"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 81,
      "metadata": {
        "id": "k47MaxR65Nq4"
      },
      "outputs": [],
      "source": [
        "class NaiveModel:\n",
        "    def __init__(self, k=10):\n",
        "        self.k = k\n",
        "        self.top_k_items = None\n",
        "\n",
        "    def fit(self, X, y):\n",
        "        df = pd.DataFrame({'album_id': X['album_id'], 'song_percent': y})\n",
        "        avg_ratings = df.groupby('album_id')['song_percent'].mean()\n",
        "        self.top_k_items = avg_ratings.nlargest(self.k).index.tolist()\n",
        "\n",
        "    def predict(self, X):\n",
        "        return [self.top_k_items] * len(X)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 82,
      "metadata": {},
      "outputs": [],
      "source": [
        "def precision_recall(actual,pred, k):\n",
        "    actuals = set(actual)\n",
        "    preds = set(pred[:k])\n",
        "    true_positives = len(actuals & preds)\n",
        "    precision = true_positives / k\n",
        "    recall = true_positives / len(actuals)\n",
        "    return precision, recall"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 83,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Osq-NpGu9V2k",
        "outputId": "cb9f28e0-1a44-4208-f520-e09ff274d48b"
      },
      "outputs": [],
      "source": [
        "model = NaiveModel()\n",
        "model.fit(X_train, y_train)\n",
        "\n",
        "y_pred = model.predict(X_val)\n",
        "\n",
        "y_test_binary = (y_val >= 0.5).astype(int)\n",
        "y_test_items = X_val['album_id'][y_test_binary == 1].tolist()\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 84,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Precision: 1.0\n",
            "Recall: 0.0011200716845879738\n"
          ]
        }
      ],
      "source": [
        "precisions = []\n",
        "recalls = []\n",
        "for i in range(len(X_val)):\n",
        "    precision, recall = precision_recall(y_test_items, y_pred[i], k=10)\n",
        "    precisions.append(precision)\n",
        "    recalls.append(recall)\n",
        "\n",
        "precision = sum(precisions) / len(precisions)\n",
        "recall = sum(recalls) / len(recalls)\n",
        "\n",
        "print(f\"Precision: {precision}\")\n",
        "print(f\"Recall: {recall}\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}