File size: 69,217 Bytes

e6fd1ec

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# Mengunduh dataset MovieLens 100k\n",
        "!wget -q https://files.grouplens.org/datasets/movielens/ml-100k.zip\n",
        "!unzip -q ml-100k.zip\n",
        "\n",
        "# Mengunduh dataset MovieLens 1M\n",
        "!wget -q https://files.grouplens.org/datasets/movielens/ml-1m.zip\n",
        "!unzip -q ml-1m.zip\n",
        "\n",
        "# Mengunduh dataset MovieLens Metadata\n",
        "!unzip -q movies_metadata.zip"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kqom8x_fb61t",
        "outputId": "cccfb8ce-aada-4a9c-e03d-f3e05258dab9"
      },
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: A\n",
            "replace ml-1m/movies.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: A\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Load Dataset Movielens\n",
        "Dataset ini harus terdiri dari tiga file master yaitu\n",
        "1. Users yang berisikan user_id, gender, age, occupation, zip_code\n",
        "2. Movies yang berisikan movie_id, title, genres, is_adult, original_language, original_title, overview, popularity, release_date, revenue, runtime, vote_average, dan vote_count.\n",
        "3. Ratings yang berisikan user_id, movie_id, rating, dan timestamp"
      ],
      "metadata": {
        "id": "GWFqG_HXbvQI"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "from sklearn.model_selection import train_test_split"
      ],
      "metadata": {
        "id": "qI07ntK6dAmy"
      },
      "execution_count": 177,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Memuat data\n",
        "ratings = pd.read_csv('ml-100k/u.data', sep='\\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])\n",
        "users = pd.read_csv('ml-100k/u.user', sep='|', names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])\n",
        "movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='ISO-8859-1', header=None, names=['movie_id', 'title', 'release_date', 'imdb_url'], usecols=[0,1,2,4])"
      ],
      "metadata": {
        "id": "p_Al0TLpcuYN"
      },
      "execution_count": 178,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Memuat data\n",
        "ml1_movies = pd.read_csv('ml-1m/movies.dat', sep='::', encoding='ISO-8859-1', header=None, names=['movie_id', 'title', 'genres'], usecols=[0,1,2])\n",
        "ml1_movies.head(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "-Umv6xZFjK_H",
        "outputId": "3e789a07-5b10-4026-d26d-fce92496dba3"
      },
      "execution_count": 179,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-179-e71d00712615>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
            "  ml1_movies = pd.read_csv('ml-1m/movies.dat', sep='::', encoding='ISO-8859-1', header=None, names=['movie_id', 'title', 'genres'], usecols=[0,1,2])\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   movie_id             title                       genres\n",
              "0         1  Toy Story (1995)  Animation|Children's|Comedy"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0121428d-a55a-4066-834d-e387ad094c88\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>movie_id</th>\n",
              "      <th>title</th>\n",
              "      <th>genres</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>Toy Story (1995)</td>\n",
              "      <td>Animation|Children's|Comedy</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0121428d-a55a-4066-834d-e387ad094c88')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-0121428d-a55a-4066-834d-e387ad094c88 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-0121428d-a55a-4066-834d-e387ad094c88');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "ml1_movies",
              "summary": "{\n  \"name\": \"ml1_movies\",\n  \"rows\": 3883,\n  \"fields\": [\n    {\n      \"column\": \"movie_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1146,\n        \"min\": 1,\n        \"max\": 3952,\n        \"num_unique_values\": 3883,\n        \"samples\": [\n          1365,\n          2706,\n          3667\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3883,\n        \"samples\": [\n          \"Ridicule (1996)\",\n          \"American Pie (1999)\",\n          \"Rent-A-Cop (1988)\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"genres\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 301,\n        \"samples\": [\n          \"Action|Adventure|Comedy|Horror\",\n          \"Romance|Western\",\n          \"Action|Adventure|Children's|Comedy\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 179
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Menggabungkan kolom 'genres' dari ml1_movies ke movies berdasarkan 'movie_id'\n",
        "movies = movies.merge(ml1_movies[['movie_id', 'genres']], on='movie_id', how='left')\n",
        "# Extract year from title\n",
        "movies[[\"title\", \"year\"]] = movies[\"title\"].str.extract('(.*)\\((\\d+)\\)')\n",
        "# Remove trailing whitespace from title\n",
        "movies[\"title\"] = movies[\"title\"].str.strip()\n",
        "ml1_movies = ml1_movies.iloc[0:0]"
      ],
      "metadata": {
        "id": "SIMv7RJdlV84"
      },
      "execution_count": 180,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ml_meta_movies = pd.read_csv('movies_metadata.csv', low_memory=False)\n",
        "ml_meta_movies.head(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "zf-ApwFIldKw",
        "outputId": "24de81f0-6af8-4b9a-c264-91d55ecc47ef"
      },
      "execution_count": 181,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   adult                              belongs_to_collection    budget  \\\n",
              "0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   \n",
              "\n",
              "                                              genres  \\\n",
              "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   \n",
              "\n",
              "                               homepage   id    imdb_id original_language  \\\n",
              "0  http://toystory.disney.com/toy-story  862  tt0114709                en   \n",
              "\n",
              "  original_title                                           overview  ...  \\\n",
              "0      Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   \n",
              "\n",
              "  release_date      revenue runtime                          spoken_languages  \\\n",
              "0   1995-10-30  373554033.0    81.0  [{'iso_639_1': 'en', 'name': 'English'}]   \n",
              "\n",
              "     status  tagline      title  video vote_average vote_count  \n",
              "0  Released      NaN  Toy Story  False          7.7     5415.0  \n",
              "\n",
              "[1 rows x 24 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-d8591515-3aef-458e-9707-9fb81eb55634\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>adult</th>\n",
              "      <th>belongs_to_collection</th>\n",
              "      <th>budget</th>\n",
              "      <th>genres</th>\n",
              "      <th>homepage</th>\n",
              "      <th>id</th>\n",
              "      <th>imdb_id</th>\n",
              "      <th>original_language</th>\n",
              "      <th>original_title</th>\n",
              "      <th>overview</th>\n",
              "      <th>...</th>\n",
              "      <th>release_date</th>\n",
              "      <th>revenue</th>\n",
              "      <th>runtime</th>\n",
              "      <th>spoken_languages</th>\n",
              "      <th>status</th>\n",
              "      <th>tagline</th>\n",
              "      <th>title</th>\n",
              "      <th>video</th>\n",
              "      <th>vote_average</th>\n",
              "      <th>vote_count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>False</td>\n",
              "      <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n",
              "      <td>30000000</td>\n",
              "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
              "      <td>http://toystory.disney.com/toy-story</td>\n",
              "      <td>862</td>\n",
              "      <td>tt0114709</td>\n",
              "      <td>en</td>\n",
              "      <td>Toy Story</td>\n",
              "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
              "      <td>...</td>\n",
              "      <td>1995-10-30</td>\n",
              "      <td>373554033.0</td>\n",
              "      <td>81.0</td>\n",
              "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
              "      <td>Released</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Toy Story</td>\n",
              "      <td>False</td>\n",
              "      <td>7.7</td>\n",
              "      <td>5415.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>1 rows × 24 columns</p>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d8591515-3aef-458e-9707-9fb81eb55634')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-d8591515-3aef-458e-9707-9fb81eb55634 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-d8591515-3aef-458e-9707-9fb81eb55634');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "ml_meta_movies"
            }
          },
          "metadata": {},
          "execution_count": 181
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(movies[\"movie_id\"].nunique())\n",
        "print(users[\"user_id\"].nunique())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fNy0OWLnqHkC",
        "outputId": "fafeda16-2ece-4738-ae68-3189b7d30cca"
      },
      "execution_count": 182,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "1682\n",
            "943\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# prompt: i wanna check that all title on dataframe movies is exists on dataframe ml_meta_movies with column title or original_title and how many title is not exists, with text is lowercase, and remove the row on movies if is not exists.\n",
        "\n",
        "# Convert titles to lowercase for comparison\n",
        "movies['title_lower'] = movies['title'].str.lower()\n",
        "ml_meta_movies['title_lower'] = ml_meta_movies['title'].str.lower()\n",
        "ml_meta_movies['original_title_lower'] = ml_meta_movies['original_title'].str.lower()\n",
        "\n",
        "# Check which titles in 'movies' exist in 'ml_meta_movies'\n",
        "movies_exist = movies['title_lower'].isin(ml_meta_movies['title_lower']) | movies['title_lower'].isin(ml_meta_movies['original_title_lower'])\n",
        "\n",
        "# Count how many titles don't exist\n",
        "not_exist_count = (~movies_exist).sum()\n",
        "print(\"Number of titles not existing in ml_meta_movies:\", not_exist_count)\n",
        "\n",
        "# Remove rows from 'movies' where titles don't exist\n",
        "movies = movies[movies_exist]\n",
        "\n",
        "# Drop the temporary lowercase title columns\n",
        "movies = movies.drop(['title_lower'], axis=1)\n",
        "ml_meta_movies = ml_meta_movies.drop(['title_lower', 'original_title_lower'], axis=1)\n",
        "movies.reset_index(drop=True, inplace=True)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Wg9oVcqi9m7p",
        "outputId": "b664355e-2d3f-4357-a02b-1f6d0e39b355"
      },
      "execution_count": 183,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of titles not existing in ml_meta_movies: 518\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# prompt: remove all rows on ratings dataframe if the column movie_id is not exists on movies dataframe\n",
        "\n",
        "# Filter ratings DataFrame based on movie existence\n",
        "ratings = ratings[ratings['movie_id'].isin(movies['movie_id'])]\n",
        "ratings.reset_index(drop=True, inplace=True)"
      ],
      "metadata": {
        "id": "2FZmRDYP-MQM"
      },
      "execution_count": 184,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# prompt: Can you reset movie_id column on movies datafram start to 1 and syncronize to movie_id on ratings dataframe\n",
        "\n",
        "# Create a mapping of old movie_id to new movie_id\n",
        "movie_id_map = {old_id: new_id for new_id, old_id in enumerate(movies['movie_id'].unique(), start=1)}\n",
        "\n",
        "# Apply the mapping to the movies DataFrame\n",
        "movies['movie_id'] = movies['movie_id'].map(movie_id_map)\n",
        "\n",
        "# Apply the mapping to the ratings DataFrame\n",
        "ratings['movie_id'] = ratings['movie_id'].map(movie_id_map)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WPOSzdEoB7qP",
        "outputId": "02a53bdb-92f0-456d-8aae-d4b30777d04e"
      },
      "execution_count": 185,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-185-05ef1c64b40f>:10: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  ratings['movie_id'] = ratings['movie_id'].map(movie_id_map)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# prompt: Can you reset user_id column on ratings datafram start to 1 and syncronize to user_id on users dataframe and remove all rows on users when the user_id is not exist on unique user_id on ratings\n",
        "\n",
        "# Get unique user_ids from ratings\n",
        "unique_rating_users = ratings['user_id'].unique()\n",
        "\n",
        "# Filter users DataFrame to keep only users present in ratings\n",
        "users = users[users['user_id'].isin(unique_rating_users)]\n",
        "users.reset_index(drop=True, inplace=True)\n",
        "\n",
        "# Create a mapping of old user_id to new user_id\n",
        "user_id_map = {old_id: new_id for new_id, old_id in enumerate(users['user_id'].unique(), start=1)}\n",
        "\n",
        "# Apply the mapping to the users DataFrame\n",
        "users['user_id'] = users['user_id'].map(user_id_map)\n",
        "\n",
        "# Apply the mapping to the ratings DataFrame\n",
        "ratings['user_id'] = ratings['user_id'].map(user_id_map)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YulqHdu7Dmf6",
        "outputId": "9ff98c2c-0b7d-419c-8698-bfb51c48ca09"
      },
      "execution_count": 186,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-186-63a67bcdbf80>:17: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  ratings['user_id'] = ratings['user_id'].map(user_id_map)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# prompt: Now i want to copy columns adult, original_language, original_title, overview, popularity, release_date, revenue, runtime, vote_average, dan vote_count from ml_meta_movies to movies dataframe based on title or original_title\n",
        "\n",
        "# Create temporary lowercase title columns for efficient comparison\n",
        "movies['title_lower'] = movies['title'].str.lower()\n",
        "ml_meta_movies['title_lower'] = ml_meta_movies['title'].str.lower()\n",
        "ml_meta_movies['original_title_lower'] = ml_meta_movies['original_title'].str.lower()\n",
        "\n",
        "# Initialize new columns in 'movies' DataFrame\n",
        "movies['adult'] = None\n",
        "movies['original_language'] = None\n",
        "movies['original_title'] = None\n",
        "movies['overview'] = None\n",
        "movies['popularity'] = None\n",
        "movies['release_date'] = None\n",
        "movies['revenue'] = None\n",
        "movies['runtime'] = None\n",
        "movies['vote_average'] = None\n",
        "movies['vote_count'] = None\n",
        "\n",
        "# Iterate over 'movies' and copy data from 'ml_meta_movies'\n",
        "for index, row in movies.iterrows():\n",
        "    title_lower = row['title_lower']\n",
        "    match = ml_meta_movies[(ml_meta_movies['title_lower'] == title_lower) | (ml_meta_movies['original_title_lower'] == title_lower)]\n",
        "    if not match.empty:\n",
        "        movies.loc[index, 'adult'] = match['adult'].iloc[0]\n",
        "        movies.loc[index, 'original_language'] = match['original_language'].iloc[0]\n",
        "        movies.loc[index, 'original_title'] = match['original_title'].iloc[0]\n",
        "        movies.loc[index, 'overview'] = match['overview'].iloc[0]\n",
        "        movies.loc[index, 'popularity'] = match['popularity'].iloc[0]\n",
        "        movies.loc[index, 'release_date'] = match['release_date'].iloc[0]\n",
        "        movies.loc[index, 'revenue'] = match['revenue'].iloc[0]\n",
        "        movies.loc[index, 'runtime'] = match['runtime'].iloc[0]\n",
        "        movies.loc[index, 'vote_average'] = match['vote_average'].iloc[0]\n",
        "        movies.loc[index, 'vote_count'] = match['vote_count'].iloc[0]\n",
        "\n",
        "# Drop the temporary lowercase title columns\n",
        "movies = movies.drop(['title_lower'], axis=1)\n",
        "ml_meta_movies = ml_meta_movies.drop(['title_lower', 'original_title_lower'], axis=1)\n"
      ],
      "metadata": {
        "id": "joj4h0U2JNRL"
      },
      "execution_count": 187,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Show Tables"
      ],
      "metadata": {
        "id": "jA-vHTcjiaYj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Ratings\n",
        "ratings.head(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "-gEkmf5mevq2",
        "outputId": "b0495a83-b6f9-41da-d493-4f04dd3efb3e"
      },
      "execution_count": 188,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   user_id  movie_id  rating  timestamp\n",
              "0      196       169       3  881250949"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-046071fc-1ebc-4261-8a1c-d4bca4119035\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>user_id</th>\n",
              "      <th>movie_id</th>\n",
              "      <th>rating</th>\n",
              "      <th>timestamp</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>196</td>\n",
              "      <td>169</td>\n",
              "      <td>3</td>\n",
              "      <td>881250949</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-046071fc-1ebc-4261-8a1c-d4bca4119035')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-046071fc-1ebc-4261-8a1c-d4bca4119035 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-046071fc-1ebc-4261-8a1c-d4bca4119035');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "ratings",
              "summary": "{\n  \"name\": \"ratings\",\n  \"rows\": 72799,\n  \"fields\": [\n    {\n      \"column\": \"user_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 266,\n        \"min\": 1,\n        \"max\": 943,\n        \"num_unique_values\": 943,\n        \"samples\": [\n          1,\n          204,\n          812\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"movie_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 236,\n        \"min\": 1,\n        \"max\": 1164,\n        \"num_unique_values\": 1164,\n        \"samples\": [\n          652,\n          683,\n          485\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 1,\n        \"max\": 5,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          1,\n          5,\n          2\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"timestamp\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5332640,\n        \"min\": 874724710,\n        \"max\": 893286638,\n        \"num_unique_values\": 41739,\n        \"samples\": [\n          892836523,\n          891224840,\n          882910457\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 188
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Users\n",
        "users.head(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "02jusHhgipqH",
        "outputId": "f8b6812e-1106-4f08-967f-f524b2b7e45b"
      },
      "execution_count": 189,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   user_id  gender age  occupation zip_code\n",
              "0        1      24   M  technician    85711"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-1b7be07c-888a-4129-ab46-b905f73eb705\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>user_id</th>\n",
              "      <th>gender</th>\n",
              "      <th>age</th>\n",
              "      <th>occupation</th>\n",
              "      <th>zip_code</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>24</td>\n",
              "      <td>M</td>\n",
              "      <td>technician</td>\n",
              "      <td>85711</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1b7be07c-888a-4129-ab46-b905f73eb705')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-1b7be07c-888a-4129-ab46-b905f73eb705 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-1b7be07c-888a-4129-ab46-b905f73eb705');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "users",
              "summary": "{\n  \"name\": \"users\",\n  \"rows\": 943,\n  \"fields\": [\n    {\n      \"column\": \"user_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 272,\n        \"min\": 1,\n        \"max\": 943,\n        \"num_unique_values\": 943,\n        \"samples\": [\n          97,\n          266,\n          811\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"gender\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 12,\n        \"min\": 7,\n        \"max\": 73,\n        \"num_unique_values\": 61,\n        \"samples\": [\n          24,\n          57,\n          52\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"age\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"F\",\n          \"M\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"occupation\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 21,\n        \"samples\": [\n          \"technician\",\n          \"healthcare\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"zip_code\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 795,\n        \"samples\": [\n          \"90016\",\n          \"15232\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 189
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Movies\n",
        "movies.head(1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 0
        },
        "id": "5y3nX2tQivh_",
        "outputId": "ec5a7f2c-eb79-4fc9-a58e-5473a6074d2a"
      },
      "execution_count": 190,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   movie_id      title release_date  \\\n",
              "0         1  Toy Story   1995-10-30   \n",
              "\n",
              "                                            imdb_url  \\\n",
              "0  http://us.imdb.com/M/title-exact?Toy%20Story%2...   \n",
              "\n",
              "                        genres  year  adult original_language original_title  \\\n",
              "0  Animation|Children's|Comedy  1995  False                en      Toy Story   \n",
              "\n",
              "                                            overview popularity      revenue  \\\n",
              "0  Led by Woody, Andy's toys live happily in his ...  21.946943  373554033.0   \n",
              "\n",
              "  runtime vote_average vote_count  \n",
              "0    81.0          7.7     5415.0  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-1c36e9a6-1049-44ca-879c-ccaf267483f7\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>movie_id</th>\n",
              "      <th>title</th>\n",
              "      <th>release_date</th>\n",
              "      <th>imdb_url</th>\n",
              "      <th>genres</th>\n",
              "      <th>year</th>\n",
              "      <th>adult</th>\n",
              "      <th>original_language</th>\n",
              "      <th>original_title</th>\n",
              "      <th>overview</th>\n",
              "      <th>popularity</th>\n",
              "      <th>revenue</th>\n",
              "      <th>runtime</th>\n",
              "      <th>vote_average</th>\n",
              "      <th>vote_count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>Toy Story</td>\n",
              "      <td>1995-10-30</td>\n",
              "      <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
              "      <td>Animation|Children's|Comedy</td>\n",
              "      <td>1995</td>\n",
              "      <td>False</td>\n",
              "      <td>en</td>\n",
              "      <td>Toy Story</td>\n",
              "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
              "      <td>21.946943</td>\n",
              "      <td>373554033.0</td>\n",
              "      <td>81.0</td>\n",
              "      <td>7.7</td>\n",
              "      <td>5415.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1c36e9a6-1049-44ca-879c-ccaf267483f7')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-1c36e9a6-1049-44ca-879c-ccaf267483f7 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-1c36e9a6-1049-44ca-879c-ccaf267483f7');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "movies",
              "summary": "{\n  \"name\": \"movies\",\n  \"rows\": 1164,\n  \"fields\": [\n    {\n      \"column\": \"movie_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 336,\n        \"min\": 1,\n        \"max\": 1164,\n        \"num_unique_values\": 1164,\n        \"samples\": [\n          765,\n          102,\n          774\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1145,\n        \"samples\": [\n          \"Titanic\",\n          \"Hard Eight\",\n          \"Immortal Beloved\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"release_date\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 756,\n        \"samples\": [\n          \"1977-04-06\",\n          \"1973-12-17\",\n          \"1994-05-13\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"imdb_url\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1149,\n        \"samples\": [\n          \"http://us.imdb.com/M/title-exact?Shall%20we%20DANSU%3F%20%281996%29\",\n          \"http://us.imdb.com/M/title-exact?Koyaanisqatsi%20(1983)\",\n          \"http://us.imdb.com/M/title-exact?Conan+the+Barbarian+(1981)\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"genres\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 176,\n        \"samples\": [\n          \"Documentary\",\n          \"Comedy|Drama|Romance\",\n          \"Action|Romance\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"year\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 65,\n        \"samples\": [\n          \"1943\",\n          \"1952\",\n          \"1995\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"adult\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          \"False\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"original_language\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 20,\n        \"samples\": [\n          \"en\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"original_title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1145,\n        \"samples\": [\n          \"Titanic\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"overview\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1139,\n        \"samples\": [\n          \"Dorothy Parker remembers the heyday of the Algonquin Round Table, a circle of friends whose barbed wit, like hers, was fueled by alcohol and flirted with despair.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"popularity\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 1144,\n        \"samples\": [\n          \"26.88907\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"revenue\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0.0,\n        \"max\": 1845034188.0,\n        \"num_unique_values\": 585,\n        \"samples\": [\n          19075720.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"runtime\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0.0,\n        \"max\": 242.0,\n        \"num_unique_values\": 113,\n        \"samples\": [\n          141.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"vote_average\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0.0,\n        \"max\": 10.0,\n        \"num_unique_values\": 57,\n        \"samples\": [\n          7.7\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"vote_count\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": 0.0,\n        \"max\": 8670.0,\n        \"num_unique_values\": 484,\n        \"samples\": [\n          92.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 190
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Sample Recsys to check dataset is valid for embedding or not"
      ],
      "metadata": {
        "id": "hU4wX4dn-zXO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Memuat data\n",
        "data = ratings.copy()\n",
        "data = data[['user_id', 'movie_id', 'rating']]\n",
        "\n",
        "# Normalisasi ID pengguna dan item (karena ID asli mungkin tidak dimulai dari 0)\n",
        "data['user_id'] = data['user_id'] - 1\n",
        "data['movie_id'] = data['movie_id'] - 1\n",
        "\n",
        "# Melihat statistik dataset\n",
        "num_users = data['user_id'].nunique()\n",
        "num_items = data['movie_id'].nunique()\n",
        "print(f\"Number of users: {num_users}, Number of items: {num_items}\")\n",
        "\n",
        "# Split dataset menjadi train dan test\n",
        "train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kI90v3Mu-9Bd",
        "outputId": "0fdd7e17-60d0-4727-9421-6c081f12d621"
      },
      "execution_count": 191,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of users: 943, Number of items: 1164\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "from torch.utils.data import DataLoader, Dataset\n",
        "\n",
        "class MovieLensDataset(Dataset):\n",
        "    def __init__(self, data):\n",
        "        self.user_ids = torch.tensor(data['user_id'].values, dtype=torch.long)\n",
        "        self.item_ids = torch.tensor(data['movie_id'].values, dtype=torch.long)\n",
        "        self.ratings = torch.tensor(data['rating'].values, dtype=torch.float32)\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.ratings)\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]\n",
        "\n",
        "class MFModel(nn.Module):\n",
        "    def __init__(self, num_users, num_items, embedding_size):\n",
        "        super(MFModel, self).__init__()\n",
        "        self.user_embedding = nn.Embedding(num_users, embedding_size)\n",
        "        self.item_embedding = nn.Embedding(num_items, embedding_size)\n",
        "\n",
        "    def forward(self, user_id, item_id):\n",
        "        user_vec = self.user_embedding(user_id)\n",
        "        item_vec = self.item_embedding(item_id)\n",
        "        dot_product = torch.sum(user_vec * item_vec, dim=1)\n",
        "        return dot_product\n",
        "\n",
        "    def regularization_loss(self):\n",
        "        return self.reg_factor * (torch.norm(self.user_embedding.weight) + torch.norm(self.item_embedding.weight))"
      ],
      "metadata": {
        "id": "s025DVSf_nhh"
      },
      "execution_count": 192,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# DataLoader untuk training\n",
        "train_dataset = MovieLensDataset(train_data)\n",
        "train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)\n",
        "\n",
        "# Hyperparameters\n",
        "embedding_size = 30\n",
        "reg_factor = 0.01\n",
        "model = MFModel(num_users, num_items, embedding_size)\n",
        "criterion = nn.MSELoss()\n",
        "optimizer = optim.Adam(model.parameters(), lr=0.01)\n",
        "\n",
        "# Training loop\n",
        "for epoch in range(10):\n",
        "    model.train()\n",
        "    total_loss = 0\n",
        "    for data_user_id, data_item_id, data_rating in train_loader:\n",
        "        optimizer.zero_grad()\n",
        "        predictions = model(data_user_id, data_item_id)\n",
        "        loss = criterion(predictions, data_rating)\n",
        "        loss.backward()\n",
        "        optimizer.step()\n",
        "        total_loss += loss.item()\n",
        "    print(f\"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QmZazumN_ylY",
        "outputId": "60b6fb1c-0206-4b91-f41c-cb7f7cf0bae4"
      },
      "execution_count": 193,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1, Loss: 34.7072\n",
            "Epoch 2, Loss: 18.3940\n",
            "Epoch 3, Loss: 9.6635\n",
            "Epoch 4, Loss: 4.3265\n",
            "Epoch 5, Loss: 2.3596\n",
            "Epoch 6, Loss: 1.5818\n",
            "Epoch 7, Loss: 1.1944\n",
            "Epoch 8, Loss: 0.9714\n",
            "Epoch 9, Loss: 0.8330\n",
            "Epoch 10, Loss: 0.7384\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import mean_squared_error\n",
        "import numpy as np\n",
        "\n",
        "model.eval()\n",
        "test_dataset = MovieLensDataset(test_data)\n",
        "test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)\n",
        "\n",
        "predictions, targets = [], []\n",
        "with torch.no_grad():\n",
        "    for data_user_id, data_item_id, data_rating in test_loader:\n",
        "        output = model(data_user_id, data_item_id)\n",
        "        predictions.extend(output.numpy())\n",
        "        targets.extend(data_rating.numpy())\n",
        "\n",
        "rmse = np.sqrt(mean_squared_error(targets, predictions))\n",
        "print(f\"Test RMSE: {rmse:.4f}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w9mD2UhHI0Kx",
        "outputId": "76b52339-ec74-420e-f0b8-add08e66842d"
      },
      "execution_count": 194,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Test RMSE: 1.8248\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def get_top_n_recommendations_pytorch(model, user_id, N=10):\n",
        "    # Dapatkan semua item yang tersedia\n",
        "    all_items = np.array(range(num_items))\n",
        "\n",
        "    # Cek item yang sudah dirating oleh user\n",
        "    rated_items = train_data[train_data['user_id'] == user_id]['movie_id'].values\n",
        "\n",
        "    # Ambil item yang belum dirating oleh user\n",
        "    items_to_predict = np.setdiff1d(all_items, rated_items)\n",
        "\n",
        "    # Prediksi rating untuk item-item tersebut\n",
        "    model.eval()\n",
        "    with torch.no_grad():\n",
        "        user_ids = torch.tensor([user_id] * len(items_to_predict))\n",
        "        item_ids = torch.tensor(items_to_predict)\n",
        "        predicted_ratings = model(user_ids, item_ids).numpy()\n",
        "\n",
        "    # Urutkan item berdasarkan rating tertinggi\n",
        "    top_n_items = items_to_predict[np.argsort(predicted_ratings)[-N:][::-1]]\n",
        "\n",
        "    return top_n_items\n",
        "\n",
        "# Contoh penggunaan\n",
        "user_id = 0\n",
        "top_n_recommendations = get_top_n_recommendations_pytorch(model, user_id, N=10)\n",
        "print(f\"Top 10 recommended items for user {user_id}: {top_n_recommendations}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "04UHnGBtI8CB",
        "outputId": "93f34309-174b-41bd-af2c-f8a353efe5ad"
      },
      "execution_count": 195,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Top 10 recommended items for user 0: [1151  916  718 1134  832  941  327  347  631  434]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Export the datasets\n"
      ],
      "metadata": {
        "id": "PEfUd5FtKQS8"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "ratings.to_csv(f\"ratings.csv\", index=False)\n",
        "movies.to_csv(f\"movies.csv\", index=False)\n",
        "users.to_csv(f\"users.csv\", index=False)"
      ],
      "metadata": {
        "id": "tVZFC_urL4Yd"
      },
      "execution_count": 196,
      "outputs": []
    }
  ]
}