{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4ba6aba8"
      },
      "source": [
        "# 🤖 **Data Collection, Creation, Storage, and Processing**\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jpASMyIQMaAq"
      },
      "source": [
        "## **1.** 📦 Install required packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f48c8f8c",
        "outputId": "b2ca7f1a-9d54-4844-d0b5-6bdcac1fdf58"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
            "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
            "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
            "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
            "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n",
            "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
            "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n",
            "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
            "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
            "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
            "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
          ]
        }
      ],
      "source": [
        "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lquNYCbfL9IM"
      },
      "source": [
        "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0IWuNpxxYDJF"
      },
      "source": [
        "### *a. Initial setup*\n",
        "Define the base url of the website you will scrape as well as how and what you will scrape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "91d52125"
      },
      "outputs": [],
      "source": [
        "import requests\n",
        "from bs4 import BeautifulSoup\n",
        "import pandas as pd\n",
        "import time\n",
        "\n",
        "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
        "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
        "\n",
        "titles, prices, ratings = [], [], []"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oCdTsin2Yfp3"
      },
      "source": [
        "### *b. Fill titles, prices, and ratings from the web pages*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "xqO5Y3dnYhxt"
      },
      "outputs": [],
      "source": [
        "# Loop through all 50 pages\n",
        "for page in range(1, 51):\n",
        "    url = base_url.format(page)\n",
        "    response = requests.get(url, headers=headers)\n",
        "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
        "    books = soup.find_all(\"article\", class_=\"product_pod\")\n",
        "\n",
        "    for book in books:\n",
        "        titles.append(book.h3.a[\"title\"])\n",
        "        prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n",
        "        ratings.append(book.p.get(\"class\")[1])\n",
        "\n",
        "    time.sleep(0.5)  # polite scraping delay"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T0TOeRC4Yrnn"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "l5FkkNhUYTHh",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "8c962ec1-dc39-44ad-b779-730351edde24"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                   title  price rating\n",
              "0                   A Light in the Attic  51.77  Three\n",
              "1                     Tipping the Velvet  53.74    One\n",
              "2                             Soumission  50.10    One\n",
              "3                          Sharp Objects  47.82   Four\n",
              "4  Sapiens: A Brief History of Humankind  54.23   Five"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-9dee82af-7d23-47b2-a4cb-92b1f8c3d09e\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>price</th>\n",
              "      <th>rating</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>51.77</td>\n",
              "      <td>Three</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Tipping the Velvet</td>\n",
              "      <td>53.74</td>\n",
              "      <td>One</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Soumission</td>\n",
              "      <td>50.10</td>\n",
              "      <td>One</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Sharp Objects</td>\n",
              "      <td>47.82</td>\n",
              "      <td>Four</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Sapiens: A Brief History of Humankind</td>\n",
              "      <td>54.23</td>\n",
              "      <td>Five</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9dee82af-7d23-47b2-a4cb-92b1f8c3d09e')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-9dee82af-7d23-47b2-a4cb-92b1f8c3d09e button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-9dee82af-7d23-47b2-a4cb-92b1f8c3d09e');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_books",
              "summary": "{\n  \"name\": \"df_books\",\n  \"rows\": 1000,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 999,\n        \"samples\": [\n          \"The Grownup\",\n          \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n          \"Ayumi's Violin\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"price\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.446689669952772,\n        \"min\": 10.0,\n        \"max\": 59.99,\n        \"num_unique_values\": 903,\n        \"samples\": [\n          19.73,\n          55.65,\n          46.31\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"One\",\n          \"Two\",\n          \"Four\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 7
        }
      ],
      "source": [
        "import pandas as pd\n",
        "\n",
        "df_books = pd.DataFrame({\n",
        "    \"title\": titles,\n",
        "    \"price\": prices,\n",
        "    \"rating\": ratings\n",
        "})\n",
        "\n",
        "df_books.head()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "duI5dv3CZYvF"
      },
      "source": [
        "### *d. Save web-scraped dataframe either as a CSV or Excel file*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "id": "lC1U_YHtZifh"
      },
      "outputs": [],
      "source": [
        "# 💾 Save to CSV\n",
        "df_books.to_csv(\"books_data.csv\", index=False)\n",
        "\n",
        "# 💾 Or save to Excel\n",
        "# df_books.to_excel(\"books_data.xlsx\", index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qMjRKMBQZlJi"
      },
      "source": [
        "### *e. ✋🏻🛑⛔️ View first fiew lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "O_wIvTxYZqCK",
        "outputId": "9d403240-8a04-4e3d-9a2d-fec660b9b8f4"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                   title  price rating\n",
              "0                   A Light in the Attic  51.77  Three\n",
              "1                     Tipping the Velvet  53.74    One\n",
              "2                             Soumission  50.10    One\n",
              "3                          Sharp Objects  47.82   Four\n",
              "4  Sapiens: A Brief History of Humankind  54.23   Five"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-4d04f056-aeb0-4be4-b17c-1dda3120a187\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>price</th>\n",
              "      <th>rating</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>51.77</td>\n",
              "      <td>Three</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Tipping the Velvet</td>\n",
              "      <td>53.74</td>\n",
              "      <td>One</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Soumission</td>\n",
              "      <td>50.10</td>\n",
              "      <td>One</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Sharp Objects</td>\n",
              "      <td>47.82</td>\n",
              "      <td>Four</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Sapiens: A Brief History of Humankind</td>\n",
              "      <td>54.23</td>\n",
              "      <td>Five</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4d04f056-aeb0-4be4-b17c-1dda3120a187')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-4d04f056-aeb0-4be4-b17c-1dda3120a187 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-4d04f056-aeb0-4be4-b17c-1dda3120a187');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_books",
              "summary": "{\n  \"name\": \"df_books\",\n  \"rows\": 1000,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 999,\n        \"samples\": [\n          \"The Grownup\",\n          \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n          \"Ayumi's Violin\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"price\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.446689669952772,\n        \"min\": 10.0,\n        \"max\": 59.99,\n        \"num_unique_values\": 903,\n        \"samples\": [\n          19.73,\n          55.65,\n          46.31\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"One\",\n          \"Two\",\n          \"Four\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 9
        }
      ],
      "source": [
        "# View first few lines\n",
        "df_books.head()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p-1Pr2szaqLk"
      },
      "source": [
        "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SIaJUGIpaH4V"
      },
      "source": [
        "### *a. Initial setup*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "-gPXGcRPuV_9"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import random\n",
        "from datetime import datetime\n",
        "import warnings\n",
        "\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "random.seed(2025)\n",
        "np.random.seed(2025)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pY4yCoIuaQqp"
      },
      "source": [
        "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "mnd5hdAbaNjz"
      },
      "outputs": [],
      "source": [
        "def generate_popularity_score(rating):\n",
        "    base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n",
        "    trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n",
        "    return int(np.clip(base + trend_factor, 1, 5))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n4-TaNTFgPak"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "V-G3OCUCgR07",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "3bfd9e0f-fa90-4b13-ba80-9b468d041978"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                   title  price rating  popularity_score\n",
              "0                   A Light in the Attic  51.77  Three                 3\n",
              "1                     Tipping the Velvet  53.74    One                 2\n",
              "2                             Soumission  50.10    One                 2\n",
              "3                          Sharp Objects  47.82   Four                 4\n",
              "4  Sapiens: A Brief History of Humankind  54.23   Five                 3"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-424ecf49-9455-4a9b-a36f-492c981573db\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>price</th>\n",
              "      <th>rating</th>\n",
              "      <th>popularity_score</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>51.77</td>\n",
              "      <td>Three</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Tipping the Velvet</td>\n",
              "      <td>53.74</td>\n",
              "      <td>One</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Soumission</td>\n",
              "      <td>50.10</td>\n",
              "      <td>One</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Sharp Objects</td>\n",
              "      <td>47.82</td>\n",
              "      <td>Four</td>\n",
              "      <td>4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Sapiens: A Brief History of Humankind</td>\n",
              "      <td>54.23</td>\n",
              "      <td>Five</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-424ecf49-9455-4a9b-a36f-492c981573db')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-424ecf49-9455-4a9b-a36f-492c981573db button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-424ecf49-9455-4a9b-a36f-492c981573db');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_books",
              "summary": "{\n  \"name\": \"df_books\",\n  \"rows\": 1000,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 999,\n        \"samples\": [\n          \"The Grownup\",\n          \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n          \"Ayumi's Violin\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"price\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.446689669952772,\n        \"min\": 10.0,\n        \"max\": 59.99,\n        \"num_unique_values\": 903,\n        \"samples\": [\n          19.73,\n          55.65,\n          46.31\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"One\",\n          \"Two\",\n          \"Four\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"popularity_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 1,\n        \"max\": 5,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          2,\n          5,\n          4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 13
        }
      ],
      "source": [
        "# Create popularity_score column based on rating\n",
        "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n",
        "\n",
        "df_books.head()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HnngRNTgacYt"
      },
      "source": [
        "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "id": "kUtWmr8maZLZ"
      },
      "outputs": [],
      "source": [
        "def get_sentiment(popularity_score):\n",
        "    if popularity_score <= 2:\n",
        "        return \"negative\"\n",
        "    elif popularity_score == 3:\n",
        "        return \"neutral\"\n",
        "    else:\n",
        "        return \"positive\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HF9F9HIzgT7Z"
      },
      "source": [
        "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "tafQj8_7gYCG",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "57910696-4fbc-4df5-c86e-ad4dc1f0321a"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                   title  price rating  popularity_score  \\\n",
              "0                   A Light in the Attic  51.77  Three                 3   \n",
              "1                     Tipping the Velvet  53.74    One                 2   \n",
              "2                             Soumission  50.10    One                 2   \n",
              "3                          Sharp Objects  47.82   Four                 4   \n",
              "4  Sapiens: A Brief History of Humankind  54.23   Five                 3   \n",
              "\n",
              "  sentiment_label  \n",
              "0         neutral  \n",
              "1        negative  \n",
              "2        negative  \n",
              "3        positive  \n",
              "4         neutral  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0a23af4f-e965-457a-be3e-250861b3db2b\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>price</th>\n",
              "      <th>rating</th>\n",
              "      <th>popularity_score</th>\n",
              "      <th>sentiment_label</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>51.77</td>\n",
              "      <td>Three</td>\n",
              "      <td>3</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Tipping the Velvet</td>\n",
              "      <td>53.74</td>\n",
              "      <td>One</td>\n",
              "      <td>2</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Soumission</td>\n",
              "      <td>50.10</td>\n",
              "      <td>One</td>\n",
              "      <td>2</td>\n",
              "      <td>negative</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Sharp Objects</td>\n",
              "      <td>47.82</td>\n",
              "      <td>Four</td>\n",
              "      <td>4</td>\n",
              "      <td>positive</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Sapiens: A Brief History of Humankind</td>\n",
              "      <td>54.23</td>\n",
              "      <td>Five</td>\n",
              "      <td>3</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0a23af4f-e965-457a-be3e-250861b3db2b')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-0a23af4f-e965-457a-be3e-250861b3db2b button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-0a23af4f-e965-457a-be3e-250861b3db2b');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_books",
              "summary": "{\n  \"name\": \"df_books\",\n  \"rows\": 1000,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 999,\n        \"samples\": [\n          \"The Grownup\",\n          \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n          \"Ayumi's Violin\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"price\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 14.446689669952772,\n        \"min\": 10.0,\n        \"max\": 59.99,\n        \"num_unique_values\": 903,\n        \"samples\": [\n          19.73,\n          55.65,\n          46.31\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"One\",\n          \"Two\",\n          \"Four\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"popularity_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 1,\n        \"max\": 5,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          2,\n          5,\n          4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment_label\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"neutral\",\n          \"negative\",\n          \"positive\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 16
        }
      ],
      "source": [
        "# Create sentiment_label column based on popularity_score\n",
        "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n",
        "\n",
        "df_books.head()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T8AdKkmASq9a"
      },
      "source": [
        "## **4.** 📈 Generate synthetic book sales data of 18 months"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OhXbdGD5fH0c"
      },
      "source": [
        "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "id": "qkVhYPXGbgEn"
      },
      "outputs": [],
      "source": [
        "def generate_sales_profile(sentiment):\n",
        "    months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n",
        "\n",
        "    if sentiment == \"positive\":\n",
        "        base = random.randint(200, 300)\n",
        "        trend = np.linspace(base, base + random.randint(20, 60), len(months))\n",
        "    elif sentiment == \"negative\":\n",
        "        base = random.randint(20, 80)\n",
        "        trend = np.linspace(base, base - random.randint(10, 30), len(months))\n",
        "    else:  # neutral\n",
        "        base = random.randint(80, 160)\n",
        "        trend = np.full(len(months), base + random.randint(-10, 10))\n",
        "\n",
        "    seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n",
        "    noise = np.random.normal(0, 5, len(months))\n",
        "    monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
        "\n",
        "    return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L2ak1HlcgoTe"
      },
      "source": [
        "### *b. Run the function as part of building sales_data*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "id": "SlJ24AUafoDB"
      },
      "outputs": [],
      "source": [
        "sales_data = []\n",
        "for _, row in df_books.iterrows():\n",
        "    records = generate_sales_profile(row[\"sentiment_label\"])\n",
        "    for month, units in records:\n",
        "        sales_data.append({\n",
        "            \"title\": row[\"title\"],\n",
        "            \"month\": month,\n",
        "            \"units_sold\": units,\n",
        "            \"sentiment_label\": row[\"sentiment_label\"]\n",
        "        })"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4IXZKcCSgxnq"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "id": "wcN6gtiZg-ws",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "outputId": "2c5a32e4-0960-4f83-8397-36f750ffc7f0"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                  title    month  units_sold sentiment_label\n",
              "0  A Light in the Attic  2024-08         100         neutral\n",
              "1  A Light in the Attic  2024-09         109         neutral\n",
              "2  A Light in the Attic  2024-10         102         neutral\n",
              "3  A Light in the Attic  2024-11         107         neutral\n",
              "4  A Light in the Attic  2024-12         108         neutral"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-132238b5-accd-472f-a20a-d941bb0a04de\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>month</th>\n",
              "      <th>units_sold</th>\n",
              "      <th>sentiment_label</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>2024-08</td>\n",
              "      <td>100</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>2024-09</td>\n",
              "      <td>109</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>2024-10</td>\n",
              "      <td>102</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>2024-11</td>\n",
              "      <td>107</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>A Light in the Attic</td>\n",
              "      <td>2024-12</td>\n",
              "      <td>108</td>\n",
              "      <td>neutral</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-132238b5-accd-472f-a20a-d941bb0a04de')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-132238b5-accd-472f-a20a-d941bb0a04de button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-132238b5-accd-472f-a20a-d941bb0a04de');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df_sales",
              "summary": "{\n  \"name\": \"df_sales\",\n  \"rows\": 18000,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 999,\n        \"samples\": [\n          \"The Grownup\",\n          \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n          \"Ayumi's Violin\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"month\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 18,\n        \"samples\": [\n          \"2024-08\",\n          \"2024-09\",\n          \"2025-04\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"units_sold\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 98,\n        \"min\": 0,\n        \"max\": 362,\n        \"num_unique_values\": 354,\n        \"samples\": [\n          214,\n          289,\n          205\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment_label\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"neutral\",\n          \"negative\",\n          \"positive\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 19
        }
      ],
      "source": [
        "import pandas as pd\n",
        "\n",
        "df_sales = pd.DataFrame(sales_data)\n",
        "\n",
        "df_sales.head()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EhIjz9WohAmZ"
      },
      "source": [
        "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MzbZvLcAhGaH",
        "outputId": "55e476f5-8c5e-4c7b-b6a4-043afc037bae"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "                  title    month  units_sold sentiment_label\n",
            "0  A Light in the Attic  2024-08         100         neutral\n",
            "1  A Light in the Attic  2024-09         109         neutral\n",
            "2  A Light in the Attic  2024-10         102         neutral\n",
            "3  A Light in the Attic  2024-11         107         neutral\n",
            "4  A Light in the Attic  2024-12         108         neutral\n"
          ]
        }
      ],
      "source": [
        "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
        "\n",
        "print(df_sales.head())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7g9gqBgQMtJn"
      },
      "source": [
        "## **5.** 🎯 Generate synthetic customer reviews"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Gi4y9M9KuDWx"
      },
      "source": [
        "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 27,
      "metadata": {
        "id": "b3cd2a50"
      },
      "outputs": [],
      "source": [
        "synthetic_reviews_by_sentiment = {\n",
        "    \"positive\": [\n",
        "        \"Absolutely loved this book from start to finish.\",\n",
        "        \"A beautifully written story with memorable characters.\",\n",
        "        \"Engaging plot and emotionally satisfying ending.\",\n",
        "        \"One of the most enjoyable reads I've had in a while.\",\n",
        "        \"Inspiring, thoughtful, and deeply moving.\",\n",
        "        \"The storytelling was immersive and captivating.\",\n",
        "        \"A powerful narrative that stayed with me.\",\n",
        "        \"Well-paced and wonderfully developed.\",\n",
        "        \"The characters felt authentic and relatable.\",\n",
        "        \"An uplifting and rewarding experience.\",\n",
        "        \"Brilliant execution and compelling themes.\",\n",
        "        \"The author’s voice was strong and confident.\",\n",
        "        \"A delightful surprise that exceeded expectations.\",\n",
        "        \"Emotionally rich and incredibly engaging.\",\n",
        "        \"Hard to put down once I started.\",\n",
        "        \"An outstanding contribution to the genre.\",\n",
        "        \"Smart, entertaining, and beautifully crafted.\",\n",
        "        \"A truly satisfying and meaningful read.\",\n",
        "        \"The writing style was elegant and fluid.\",\n",
        "        \"An unforgettable literary experience.\",\n",
        "        \"Creative, thoughtful, and well-structured.\",\n",
        "        \"The plot twists were exciting and well done.\",\n",
        "        \"A heartwarming and impactful story.\",\n",
        "        \"Masterfully written and highly engaging.\",\n",
        "        \"The pacing kept me hooked throughout.\",\n",
        "        \"An impressive and polished work.\",\n",
        "        \"Deeply resonant and emotionally compelling.\",\n",
        "        \"A refreshing and enjoyable read.\",\n",
        "        \"Strong character development and vivid scenes.\",\n",
        "        \"A fascinating and rewarding journey.\",\n",
        "        \"The dialogue felt natural and sharp.\",\n",
        "        \"An inspiring and beautifully told tale.\",\n",
        "        \"Richly detailed and thoughtfully written.\",\n",
        "        \"A standout book in its category.\",\n",
        "        \"Highly recommended for fans of the genre.\",\n",
        "        \"An engaging blend of emotion and action.\",\n",
        "        \"Thought-provoking and satisfying.\",\n",
        "        \"A memorable and meaningful story.\",\n",
        "        \"The themes were handled with depth and care.\",\n",
        "        \"An exceptional and absorbing narrative.\",\n",
        "        \"Truly enjoyable from beginning to end.\",\n",
        "        \"A smart and emotionally layered story.\",\n",
        "        \"Compelling storytelling with a strong finish.\",\n",
        "        \"A wonderfully crafted and immersive book.\",\n",
        "        \"An exciting and heartfelt read.\",\n",
        "        \"The author did a fantastic job.\",\n",
        "        \"A captivating and beautifully structured novel.\",\n",
        "        \"A rewarding and enriching experience.\",\n",
        "        \"Strong writing and vivid imagination.\",\n",
        "        \"An excellent book I would gladly reread.\"\n",
        "    ],\n",
        "\n",
        "    \"neutral\": [\n",
        "        \"An average read with some interesting moments.\",\n",
        "        \"It had both strengths and weaknesses.\",\n",
        "        \"Not bad, but not particularly memorable either.\",\n",
        "        \"Some parts were engaging, others less so.\",\n",
        "        \"A decent book for passing the time.\",\n",
        "        \"The story was fine but nothing extraordinary.\",\n",
        "        \"Moderately enjoyable overall.\",\n",
        "        \"The pacing was uneven at times.\",\n",
        "        \"An okay read with mixed impressions.\",\n",
        "        \"Some characters stood out more than others.\",\n",
        "        \"The plot was predictable but acceptable.\",\n",
        "        \"A fairly standard story.\",\n",
        "        \"There were highlights, but also dull sections.\",\n",
        "        \"It met expectations but didn’t exceed them.\",\n",
        "        \"Reasonably entertaining but not remarkable.\",\n",
        "        \"The writing was serviceable.\",\n",
        "        \"An average experience overall.\",\n",
        "        \"The concept was interesting but execution varied.\",\n",
        "        \"A typical entry in the genre.\",\n",
        "        \"It held my attention in parts.\",\n",
        "        \"Not particularly original, but readable.\",\n",
        "        \"Some scenes worked better than others.\",\n",
        "        \"An adequate and straightforward read.\",\n",
        "        \"The themes were present but lightly explored.\",\n",
        "        \"Neither impressive nor disappointing.\",\n",
        "        \"A passable story with moderate appeal.\",\n",
        "        \"The ending was acceptable.\",\n",
        "        \"It had potential that wasn’t fully realized.\",\n",
        "        \"Somewhat engaging but uneven.\",\n",
        "        \"An alright book with room for improvement.\",\n",
        "        \"Fairly enjoyable in places.\",\n",
        "        \"The dialogue was average.\",\n",
        "        \"A mixed but tolerable experience.\",\n",
        "        \"The characters were moderately developed.\",\n",
        "        \"It was fine for casual reading.\",\n",
        "        \"An ordinary story with standard pacing.\",\n",
        "        \"There were moments of interest.\",\n",
        "        \"Not bad overall, just not standout.\",\n",
        "        \"A reasonable but forgettable read.\",\n",
        "        \"The book was competently written.\",\n",
        "        \"It did what it set out to do.\",\n",
        "        \"Slightly engaging but not gripping.\",\n",
        "        \"An acceptable addition to the shelf.\",\n",
        "        \"The story had both highs and lows.\",\n",
        "        \"An average literary effort.\",\n",
        "        \"Somewhat enjoyable but not memorable.\",\n",
        "        \"The narrative was steady but plain.\",\n",
        "        \"A simple and predictable read.\",\n",
        "        \"It was okay overall.\",\n",
        "        \"A neutral reading experience.\"\n",
        "    ],\n",
        "\n",
        "    \"negative\": [\n",
        "        \"I struggled to stay interested throughout.\",\n",
        "        \"The plot felt confusing and disjointed.\",\n",
        "        \"Disappointing overall and hard to finish.\",\n",
        "        \"The characters lacked depth and realism.\",\n",
        "        \"It didn’t live up to expectations.\",\n",
        "        \"The pacing was slow and uneven.\",\n",
        "        \"I found it difficult to connect with the story.\",\n",
        "        \"The writing style didn’t appeal to me.\",\n",
        "        \"A frustrating and underwhelming read.\",\n",
        "        \"The narrative felt repetitive.\",\n",
        "        \"Not as engaging as I had hoped.\",\n",
        "        \"The story lacked coherence.\",\n",
        "        \"I lost interest midway through.\",\n",
        "        \"The dialogue felt unnatural.\",\n",
        "        \"A missed opportunity with weak execution.\",\n",
        "        \"The ending was unsatisfying.\",\n",
        "        \"The plot twists were predictable.\",\n",
        "        \"It felt overly long and dragged out.\",\n",
        "        \"The themes were poorly developed.\",\n",
        "        \"I expected much more from this book.\",\n",
        "        \"The characters were forgettable.\",\n",
        "        \"The story felt flat and uninspired.\",\n",
        "        \"It failed to capture my attention.\",\n",
        "        \"The structure was confusing.\",\n",
        "        \"A dull and disappointing experience.\",\n",
        "        \"The writing lacked clarity.\",\n",
        "        \"I wouldn’t recommend this one.\",\n",
        "        \"The concept was interesting but poorly executed.\",\n",
        "        \"It felt rushed in key parts.\",\n",
        "        \"The emotional impact was minimal.\",\n",
        "        \"The storyline was weak and inconsistent.\",\n",
        "        \"I found it tedious to read.\",\n",
        "        \"The book didn’t resonate with me.\",\n",
        "        \"The pacing made it hard to enjoy.\",\n",
        "        \"The character arcs were unsatisfying.\",\n",
        "        \"A forgettable and disappointing read.\",\n",
        "        \"It lacked originality and depth.\",\n",
        "        \"The plot development was poor.\",\n",
        "        \"I struggled to understand the direction.\",\n",
        "        \"It didn’t hold my attention.\",\n",
        "        \"The story felt incomplete.\",\n",
        "        \"The writing was uninspired.\",\n",
        "        \"The book was not engaging.\",\n",
        "        \"I wouldn’t read it again.\",\n",
        "        \"The overall execution was lacking.\",\n",
        "        \"The narrative felt forced.\",\n",
        "        \"It was difficult to stay invested.\",\n",
        "        \"A disappointing literary experience.\",\n",
        "        \"The book failed to impress.\",\n",
        "        \"Not worth the time in my opinion.\"\n",
        "    ]\n",
        "}\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fQhfVaDmuULT"
      },
      "source": [
        "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 28,
      "metadata": {
        "id": "l2SRc3PjuTGM"
      },
      "outputs": [],
      "source": [
        "review_rows = []\n",
        "for _, row in df_books.iterrows():\n",
        "    title = row['title']\n",
        "    sentiment_label = row['sentiment_label']\n",
        "    review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
        "    sampled_reviews = random.sample(review_pool, 10)\n",
        "    for review_text in sampled_reviews:\n",
        "        review_rows.append({\n",
        "            \"title\": title,\n",
        "            \"sentiment_label\": sentiment_label,\n",
        "            \"review_text\": review_text,\n",
        "            \"rating\": row['rating'],\n",
        "            \"popularity_score\": row['popularity_score']\n",
        "        })"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bmJMXF-Bukdm"
      },
      "source": [
        "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 29,
      "metadata": {
        "id": "ZUKUqZsuumsp"
      },
      "outputs": [],
      "source": [
        "df_reviews = pd.DataFrame(review_rows)\n",
        "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### *c. inputs for R*"
      ],
      "metadata": {
        "id": "_602pYUS3gY5"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3946e521",
        "outputId": "33160805-20df-4483-dde9-3b557b25f063"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "✅ Wrote synthetic_title_level_features.csv\n",
            "✅ Wrote synthetic_monthly_revenue_series.csv\n"
          ]
        }
      ],
      "source": [
        "import numpy as np\n",
        "\n",
        "def _safe_num(s):\n",
        "    return pd.to_numeric(\n",
        "        pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n",
        "        errors=\"coerce\"\n",
        "    )\n",
        "\n",
        "# --- Clean book metadata (price/rating) ---\n",
        "df_books_r = df_books.copy()\n",
        "if \"price\" in df_books_r.columns:\n",
        "    df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n",
        "if \"rating\" in df_books_r.columns:\n",
        "    df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n",
        "\n",
        "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n",
        "\n",
        "# --- Clean sales ---\n",
        "df_sales_r = df_sales.copy()\n",
        "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n",
        "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n",
        "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n",
        "\n",
        "# --- Clean reviews ---\n",
        "df_reviews_r = df_reviews.copy()\n",
        "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n",
        "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n",
        "if \"rating\" in df_reviews_r.columns:\n",
        "    df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n",
        "if \"popularity_score\" in df_reviews_r.columns:\n",
        "    df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n",
        "\n",
        "# --- Sentiment shares per title (from reviews) ---\n",
        "sent_counts = (\n",
        "    df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n",
        "    .size()\n",
        "    .unstack(fill_value=0)\n",
        ")\n",
        "for lab in [\"positive\", \"neutral\", \"negative\"]:\n",
        "    if lab not in sent_counts.columns:\n",
        "        sent_counts[lab] = 0\n",
        "\n",
        "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n",
        "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n",
        "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n",
        "sent_counts[\"share_neutral\"]  = sent_counts[\"neutral\"]  / den\n",
        "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n",
        "sent_counts = sent_counts.reset_index()\n",
        "\n",
        "# --- Sales aggregation per title ---\n",
        "sales_by_title = (\n",
        "    df_sales_r.dropna(subset=[\"title\"])\n",
        "    .groupby(\"title\", as_index=False)\n",
        "    .agg(\n",
        "        months_observed=(\"month\", \"nunique\"),\n",
        "        avg_units_sold=(\"units_sold\", \"mean\"),\n",
        "        total_units_sold=(\"units_sold\", \"sum\"),\n",
        "    )\n",
        ")\n",
        "\n",
        "# --- Title-level features (join sales + books + sentiment) ---\n",
        "df_title = (\n",
        "    sales_by_title\n",
        "    .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n",
        "    .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n",
        "           on=\"title\", how=\"left\")\n",
        ")\n",
        "\n",
        "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n",
        "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n",
        "\n",
        "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n",
        "print(\"✅ Wrote synthetic_title_level_features.csv\")\n",
        "\n",
        "# --- Monthly revenue series (proxy: units_sold * price) ---\n",
        "monthly_rev = (\n",
        "    df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n",
        ")\n",
        "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n",
        "\n",
        "df_monthly = (\n",
        "    monthly_rev.dropna(subset=[\"month\"])\n",
        "    .groupby(\"month\", as_index=False)[\"revenue\"]\n",
        "    .sum()\n",
        "    .rename(columns={\"revenue\": \"total_revenue\"})\n",
        "    .sort_values(\"month\")\n",
        ")\n",
        "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n",
        "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n",
        "    df_monthly = (\n",
        "        df_sales_r.dropna(subset=[\"month\"])\n",
        "        .groupby(\"month\", as_index=False)[\"units_sold\"]\n",
        "        .sum()\n",
        "        .rename(columns={\"units_sold\": \"total_revenue\"})\n",
        "        .sort_values(\"month\")\n",
        "    )\n",
        "\n",
        "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n",
        "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n",
        "print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RYvGyVfXuo54"
      },
      "source": [
        "### *d. ✋🏻🛑⛔️ View the first few lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 963
        },
        "id": "xfE8NMqOurKo",
        "outputId": "b34adff4-1832-4c6c-b4d6-7a40b303615e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "=== df_title (title-level features) ===\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "                                               title  months_observed  \\\n",
              "0  \"Most Blessed of the Patriarchs\": Thomas Jeffe...               18   \n",
              "1                                          #GIRLBOSS               18   \n",
              "2  #HigherSelfie: Wake Up Your Life. Free Your So...               18   \n",
              "3                                       'Salem's Lot               18   \n",
              "4  (Un)Qualified: How God Uses Broken People to D...               18   \n",
              "\n",
              "   avg_units_sold  total_units_sold  price  rating  share_positive  \\\n",
              "0      285.555556              5140  44.48     NaN             1.0   \n",
              "1       47.944444               863  50.96     NaN             0.0   \n",
              "2      226.777778              4082  23.11     NaN             1.0   \n",
              "3      246.055556              4429  49.56     NaN             1.0   \n",
              "4      294.444444              5300  54.00     NaN             1.0   \n",
              "\n",
              "   share_neutral  share_negative  total_reviews   avg_revenue  total_revenue  \n",
              "0            0.0             0.0             10  12701.511111      228627.20  \n",
              "1            0.0             1.0             10   2443.248889       43978.48  \n",
              "2            0.0             0.0             10   5240.834444       94335.02  \n",
              "3            0.0             0.0             10  12194.513333      219501.24  \n",
              "4            0.0             0.0             10  15900.000000      286200.00  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-d4b62240-7e0a-4021-8fe2-ed3e0acce0e4\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>months_observed</th>\n",
              "      <th>avg_units_sold</th>\n",
              "      <th>total_units_sold</th>\n",
              "      <th>price</th>\n",
              "      <th>rating</th>\n",
              "      <th>share_positive</th>\n",
              "      <th>share_neutral</th>\n",
              "      <th>share_negative</th>\n",
              "      <th>total_reviews</th>\n",
              "      <th>avg_revenue</th>\n",
              "      <th>total_revenue</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>\"Most Blessed of the Patriarchs\": Thomas Jeffe...</td>\n",
              "      <td>18</td>\n",
              "      <td>285.555556</td>\n",
              "      <td>5140</td>\n",
              "      <td>44.48</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>10</td>\n",
              "      <td>12701.511111</td>\n",
              "      <td>228627.20</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>#GIRLBOSS</td>\n",
              "      <td>18</td>\n",
              "      <td>47.944444</td>\n",
              "      <td>863</td>\n",
              "      <td>50.96</td>\n",
              "      <td>NaN</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>10</td>\n",
              "      <td>2443.248889</td>\n",
              "      <td>43978.48</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>#HigherSelfie: Wake Up Your Life. Free Your So...</td>\n",
              "      <td>18</td>\n",
              "      <td>226.777778</td>\n",
              "      <td>4082</td>\n",
              "      <td>23.11</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>10</td>\n",
              "      <td>5240.834444</td>\n",
              "      <td>94335.02</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>'Salem's Lot</td>\n",
              "      <td>18</td>\n",
              "      <td>246.055556</td>\n",
              "      <td>4429</td>\n",
              "      <td>49.56</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>10</td>\n",
              "      <td>12194.513333</td>\n",
              "      <td>219501.24</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>(Un)Qualified: How God Uses Broken People to D...</td>\n",
              "      <td>18</td>\n",
              "      <td>294.444444</td>\n",
              "      <td>5300</td>\n",
              "      <td>54.00</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.0</td>\n",
              "      <td>10</td>\n",
              "      <td>15900.000000</td>\n",
              "      <td>286200.00</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d4b62240-7e0a-4021-8fe2-ed3e0acce0e4')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-d4b62240-7e0a-4021-8fe2-ed3e0acce0e4 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-d4b62240-7e0a-4021-8fe2-ed3e0acce0e4');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"print(df_monthly\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"#GIRLBOSS\",\n          \"(Un)Qualified: How God Uses Broken People to Do Big Things\",\n          \"#HigherSelfie: Wake Up Your Life. Free Your Soul. Find Your Tribe.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"months_observed\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 18,\n        \"max\": 18,\n        \"num_unique_values\": 1,\n        \"samples\": [\n          18\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_units_sold\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 100.20894374958456,\n        \"min\": 47.94444444444444,\n        \"max\": 294.44444444444446,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          47.94444444444444\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"total_units_sold\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1803,\n        \"min\": 863,\n        \"max\": 5300,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          863\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"price\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 12.400476603743908,\n        \"min\": 23.11,\n        \"max\": 54.0,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          50.96\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": null,\n        \"min\": null,\n        \"max\": null,\n        \"num_unique_values\": 0,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"share_positive\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.44721359549995804,\n        \"min\": 0.0,\n        \"max\": 1.0,\n        \"num_unique_values\": 2,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"share_neutral\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.0,\n        \"min\": 0.0,\n        \"max\": 0.0,\n        \"num_unique_values\": 1,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"share_negative\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.44721359549995804,\n        \"min\": 0.0,\n        \"max\": 1.0,\n        \"num_unique_values\": 2,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"total_reviews\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 10,\n        \"max\": 10,\n        \"num_unique_values\": 1,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_revenue\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5617.298744073206,\n        \"min\": 2443.248888888889,\n        \"max\": 15900.0,\n        \"num_unique_values\": 5,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"total_revenue\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 101111.37739331771,\n        \"min\": 43978.48,\n        \"max\": 286200.0,\n        \"num_unique_values\": 5,\n        \"samples\": [],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "shape: (1000, 12)\n",
            "rating              1000\n",
            "title                  0\n",
            "avg_units_sold         0\n",
            "months_observed        0\n",
            "total_units_sold       0\n",
            "price                  0\n",
            "share_positive         0\n",
            "share_neutral          0\n",
            "share_negative         0\n",
            "total_reviews          0\n",
            "dtype: int64\n",
            "\n",
            "=== df_monthly (monthly revenue series) ===\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "        month  total_revenue\n",
              "0  2024-08-01     5631956.77\n",
              "1  2024-09-01     5856653.68\n",
              "2  2024-10-01     6006876.26\n",
              "3  2024-11-01     6061519.85\n",
              "4  2024-12-01     6014276.79"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-5784b83a-dc01-4062-9982-150f25c9d635\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>month</th>\n",
              "      <th>total_revenue</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2024-08-01</td>\n",
              "      <td>5631956.77</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2024-09-01</td>\n",
              "      <td>5856653.68</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2024-10-01</td>\n",
              "      <td>6006876.26</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>2024-11-01</td>\n",
              "      <td>6061519.85</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>2024-12-01</td>\n",
              "      <td>6014276.79</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5784b83a-dc01-4062-9982-150f25c9d635')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-5784b83a-dc01-4062-9982-150f25c9d635 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-5784b83a-dc01-4062-9982-150f25c9d635');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"print(df_monthly\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"month\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"2024-09-01\",\n          \"2024-12-01\",\n          \"2024-10-01\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"total_revenue\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 175556.39765987248,\n        \"min\": 5631956.77,\n        \"max\": 6061519.85,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          5856653.68,\n          6014276.79,\n          6006876.26\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "shape: (18, 2)\n",
            "month            0\n",
            "total_revenue    0\n",
            "dtype: int64\n"
          ]
        }
      ],
      "source": [
        "# d. View the first few lines (and quick checks)\n",
        "\n",
        "print(\"=== df_title (title-level features) ===\")\n",
        "display(df_title.head())\n",
        "print(\"shape:\", df_title.shape)\n",
        "print(df_title.isna().sum().sort_values(ascending=False).head(10))\n",
        "\n",
        "print(\"\\n=== df_monthly (monthly revenue series) ===\")\n",
        "display(df_monthly.head())\n",
        "print(\"shape:\", df_monthly.shape)\n",
        "print(df_monthly.isna().sum())\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}