Spaces:
No application file
No application file
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "4ba6aba8" | |
| }, | |
| "source": [ | |
| "# 🤖 **Data Collection, Creation, Storage, and Processing**\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "jpASMyIQMaAq" | |
| }, | |
| "source": [ | |
| "## **1.** 📦 Install required packages" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "collapsed": true, | |
| "id": "f48c8f8c", | |
| "outputId": "04e1bdd4-2da2-4c3d-9d91-dc5c25a1cf81" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", | |
| "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", | |
| "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", | |
| "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", | |
| "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", | |
| "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", | |
| "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", | |
| "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", | |
| "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", | |
| "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", | |
| "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", | |
| "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", | |
| "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", | |
| "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", | |
| "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", | |
| "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", | |
| "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", | |
| "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", | |
| "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", | |
| "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", | |
| "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", | |
| "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", | |
| "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", | |
| "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "lquNYCbfL9IM" | |
| }, | |
| "source": [ | |
| "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "0IWuNpxxYDJF" | |
| }, | |
| "source": [ | |
| "### *a. Initial setup*\n", | |
| "Define the base URL of the website to scrape, the request headers to send, and the empty lists that will hold the scraped titles, prices, and ratings." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "id": "91d52125" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import requests\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "import pandas as pd\n", | |
| "import time\n", | |
| "import re\n", | |
| "\n", | |
| "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", | |
| "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", | |
| "\n", | |
| "titles, prices, ratings = [], [], []\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "oCdTsin2Yfp3" | |
| }, | |
| "source": [ | |
| "### *b. Fill titles, prices, and ratings from the web pages*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "id": "xqO5Y3dnYhxt" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Loop through all 50 pages\n", | |
| "for page in range(1, 51):\n", | |
| " url = base_url.format(page)\n", | |
| " response = requests.get(url, headers=headers, timeout=20)\n", | |
| " response.raise_for_status()\n", | |
| "\n", | |
| " soup = BeautifulSoup(response.content, \"html.parser\")\n", | |
| " books = soup.find_all(\"article\", class_=\"product_pod\")\n", | |
| "\n", | |
| " for book in books:\n", | |
| " raw_price = book.find(\"p\", class_=\"price_color\").get_text(strip=True)\n", | |
| " clean_price = float(re.sub(r\"[^0-9.]\", \"\", raw_price))\n", | |
| "\n", | |
| " titles.append(book.h3.a[\"title\"])\n", | |
| " prices.append(clean_price)\n", | |
| " ratings.append(book.p.get(\"class\")[1])\n", | |
| "\n", | |
| " time.sleep(0.2)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "T0TOeRC4Yrnn" | |
| }, | |
| "source": [ | |
| "### *c. ✋🏻🛑⛔️ Create a DataFrame df_books with \"title\", \"price\", and \"rating\" columns from the now-complete lists*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "id": "l5FkkNhUYTHh" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_books = pd.DataFrame({\n", | |
| " \"title\": titles,\n", | |
| " \"price\": prices,\n", | |
| " \"rating\": ratings\n", | |
| "})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "duI5dv3CZYvF" | |
| }, | |
| "source": [ | |
| "### *d. Save web-scraped dataframe either as a CSV or Excel file*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "id": "lC1U_YHtZifh" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 💾 Save to CSV\n", | |
| "df_books.to_csv(\"books_data.csv\", index=False)\n", | |
| "\n", | |
| "# 💾 Or save to Excel\n", | |
| "# df_books.to_excel(\"books_data.xlsx\", index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "qMjRKMBQZlJi" | |
| }, | |
| "source": [ | |
| "### *e. ✋🏻🛑⛔️ View the first few rows*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "O_wIvTxYZqCK", | |
| "outputId": "718c3d95-1dc9-4f67-9940-ebda3ddbf90b" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "summary": "{\n \"name\": \"df_books\",\n \"rows\": 840,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 799,\n \"samples\": [\n \"Where'd You Go, Bernadette\",\n \"Mockingjay (The Hunger Games #3)\",\n \"The Mindfulness and Acceptance Workbook for Anxiety: A Guide to Breaking Free from Anxiety, Phobias, and Worry Using Acceptance and Commitment Therapy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.560827565784338,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 740,\n \"samples\": [\n 42.95,\n 20.91,\n 21.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", | |
| "type": "dataframe", | |
| "variable_name": "df_books" | |
| }, | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-f3614048-0669-4e42-9703-5871bd840543\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>price</th>\n", | |
| " <th>rating</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>51.77</td>\n", | |
| " <td>Three</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Tipping the Velvet</td>\n", | |
| " <td>53.74</td>\n", | |
| " <td>One</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Soumission</td>\n", | |
| " <td>50.10</td>\n", | |
| " <td>One</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Sharp Objects</td>\n", | |
| " <td>47.82</td>\n", | |
| " <td>Four</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Sapiens: A Brief History of Humankind</td>\n", | |
| " <td>54.23</td>\n", | |
| " <td>Five</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f3614048-0669-4e42-9703-5871bd840543')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-f3614048-0669-4e42-9703-5871bd840543 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-f3614048-0669-4e42-9703-5871bd840543');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "text/plain": [ | |
| " title price rating\n", | |
| "0 A Light in the Attic 51.77 Three\n", | |
| "1 Tipping the Velvet 53.74 One\n", | |
| "2 Soumission 50.10 One\n", | |
| "3 Sharp Objects 47.82 Four\n", | |
| "4 Sapiens: A Brief History of Humankind 54.23 Five" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_books.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "p-1Pr2szaqLk" | |
| }, | |
| "source": [ | |
| "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "SIaJUGIpaH4V" | |
| }, | |
| "source": [ | |
| "### *a. Initial setup*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "id": "-gPXGcRPuV_9" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import random\n", | |
| "from datetime import datetime\n", | |
| "import warnings\n", | |
| "\n", | |
| "warnings.filterwarnings(\"ignore\")\n", | |
| "random.seed(2025)\n", | |
| "np.random.seed(2025)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "pY4yCoIuaQqp" | |
| }, | |
| "source": [ | |
| "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "id": "mnd5hdAbaNjz" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def generate_popularity_score(rating):\n", | |
| " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", | |
| " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", | |
| " return int(np.clip(base + trend_factor, 1, 5))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "n4-TaNTFgPak" | |
| }, | |
| "source": [ | |
| "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "V-G3OCUCgR07", | |
| "outputId": "034c1831-bbbb-477d-f8fa-a2fcf6f4159c" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "summary": "{\n \"name\": \"df_books\",\n \"rows\": 840,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 799,\n \"samples\": [\n \"Where'd You Go, Bernadette\",\n \"Mockingjay (The Hunger Games #3)\",\n \"The Mindfulness and Acceptance Workbook for Anxiety: A Guide to Breaking Free from Anxiety, Phobias, and Worry Using Acceptance and Commitment Therapy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.560827565784338,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 740,\n \"samples\": [\n 42.95,\n 20.91,\n 21.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", | |
| "type": "dataframe", | |
| "variable_name": "df_books" | |
| }, | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-749488be-58a0-4a48-abf8-6773255573e7\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>price</th>\n", | |
| " <th>rating</th>\n", | |
| " <th>popularity_score</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>51.77</td>\n", | |
| " <td>Three</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Tipping the Velvet</td>\n", | |
| " <td>53.74</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Soumission</td>\n", | |
| " <td>50.10</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Sharp Objects</td>\n", | |
| " <td>47.82</td>\n", | |
| " <td>Four</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Sapiens: A Brief History of Humankind</td>\n", | |
| " <td>54.23</td>\n", | |
| " <td>Five</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-749488be-58a0-4a48-abf8-6773255573e7')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-749488be-58a0-4a48-abf8-6773255573e7 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-749488be-58a0-4a48-abf8-6773255573e7');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "text/plain": [ | |
| " title price rating popularity_score\n", | |
| "0 A Light in the Attic 51.77 Three 3\n", | |
| "1 Tipping the Velvet 53.74 One 2\n", | |
| "2 Soumission 50.10 One 2\n", | |
| "3 Sharp Objects 47.82 Four 4\n", | |
| "4 Sapiens: A Brief History of Humankind 54.23 Five 3" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n", | |
| "\n", | |
| "df_books.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "HnngRNTgacYt" | |
| }, | |
| "source": [ | |
| "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "id": "kUtWmr8maZLZ" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_sentiment(popularity_score):\n", | |
| " if popularity_score <= 2:\n", | |
| " return \"negative\"\n", | |
| " elif popularity_score == 3:\n", | |
| " return \"neutral\"\n", | |
| " else:\n", | |
| " return \"positive\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "HF9F9HIzgT7Z" | |
| }, | |
| "source": [ | |
| "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "tafQj8_7gYCG", | |
| "outputId": "dbb4cead-9675-4491-db0e-0fedb34a10b9" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "summary": "{\n \"name\": \"df_books\",\n \"rows\": 840,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 799,\n \"samples\": [\n \"Where'd You Go, Bernadette\",\n \"Mockingjay (The Hunger Games #3)\",\n \"The Mindfulness and Acceptance Workbook for Anxiety: A Guide to Breaking Free from Anxiety, Phobias, and Worry Using Acceptance and Commitment Therapy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.560827565784338,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 740,\n \"samples\": [\n 42.95,\n 20.91,\n 21.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", | |
| "type": "dataframe", | |
| "variable_name": "df_books" | |
| }, | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-4de4fca6-e8b0-45a4-aeb2-25e946439896\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>price</th>\n", | |
| " <th>rating</th>\n", | |
| " <th>popularity_score</th>\n", | |
| " <th>sentiment_label</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>51.77</td>\n", | |
| " <td>Three</td>\n", | |
| " <td>3</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Tipping the Velvet</td>\n", | |
| " <td>53.74</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " <td>negative</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Soumission</td>\n", | |
| " <td>50.10</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " <td>negative</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Sharp Objects</td>\n", | |
| " <td>47.82</td>\n", | |
| " <td>Four</td>\n", | |
| " <td>4</td>\n", | |
| " <td>positive</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Sapiens: A Brief History of Humankind</td>\n", | |
| " <td>54.23</td>\n", | |
| " <td>Five</td>\n", | |
| " <td>3</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4de4fca6-e8b0-45a4-aeb2-25e946439896')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-4de4fca6-e8b0-45a4-aeb2-25e946439896 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-4de4fca6-e8b0-45a4-aeb2-25e946439896');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "text/plain": [ | |
| " title price rating popularity_score \\\n", | |
| "0 A Light in the Attic 51.77 Three 3 \n", | |
| "1 Tipping the Velvet 53.74 One 2 \n", | |
| "2 Soumission 50.10 One 2 \n", | |
| "3 Sharp Objects 47.82 Four 4 \n", | |
| "4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n", | |
| "\n", | |
| " sentiment_label \n", | |
| "0 neutral \n", | |
| "1 negative \n", | |
| "2 negative \n", | |
| "3 positive \n", | |
| "4 neutral " | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n", | |
| "\n", | |
| "df_books.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "T8AdKkmASq9a" | |
| }, | |
| "source": [ | |
| "## **4.** 📈 Generate 18 months of synthetic book sales data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "OhXbdGD5fH0c" | |
| }, | |
| "source": [ | |
| "### *a. Create a generate_sales_profile function that generates monthly sales patterns based on sentiment_label (with some randomness)*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "id": "qkVhYPXGbgEn" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def generate_sales_profile(sentiment):\n", | |
| " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", | |
| "\n", | |
| " if sentiment == \"positive\":\n", | |
| " base = random.randint(200, 300)\n", | |
| " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", | |
| " elif sentiment == \"negative\":\n", | |
| " base = random.randint(20, 80)\n", | |
| " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", | |
| " else: # neutral\n", | |
| " base = random.randint(80, 160)\n", | |
| " trend = np.full(len(months), base + random.randint(-10, 10))\n", | |
| "\n", | |
| " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", | |
| " noise = np.random.normal(0, 5, len(months))\n", | |
| " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", | |
| "\n", | |
| " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "L2ak1HlcgoTe" | |
| }, | |
| "source": [ | |
| "### *b. Run the function as part of building sales_data*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "id": "SlJ24AUafoDB" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sales_data = []\n", | |
| "for _, row in df_books.iterrows():\n", | |
| " records = generate_sales_profile(row[\"sentiment_label\"])\n", | |
| " for month, units in records:\n", | |
| " sales_data.append({\n", | |
| " \"title\": row[\"title\"],\n", | |
| " \"month\": month,\n", | |
| " \"units_sold\": units,\n", | |
| " \"sentiment_label\": row[\"sentiment_label\"]\n", | |
| " })" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "4IXZKcCSgxnq" | |
| }, | |
| "source": [ | |
| "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "wcN6gtiZg-ws", | |
| "outputId": "80a1cc44-a7d5-42c8-ae24-5e0536fea67c" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "summary": "{\n \"name\": \"df_sales\",\n \"rows\": 15120,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 799,\n \"samples\": [\n \"Where'd You Go, Bernadette\",\n \"Mockingjay (The Hunger Games #3)\",\n \"The Mindfulness and Acceptance Workbook for Anxiety: A Guide to Breaking Free from Anxiety, Phobias, and Worry Using Acceptance and Commitment Therapy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-09\",\n \"2024-10\",\n \"2025-05\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 355,\n \"samples\": [\n 247,\n 302,\n 56\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", | |
| "type": "dataframe", | |
| "variable_name": "df_sales" | |
| }, | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-a851ff68-be2f-4e9a-849f-62e1c2a663f1\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>month</th>\n", | |
| " <th>units_sold</th>\n", | |
| " <th>sentiment_label</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>2024-09</td>\n", | |
| " <td>127</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>2024-10</td>\n", | |
| " <td>136</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>2024-11</td>\n", | |
| " <td>129</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>2024-12</td>\n", | |
| " <td>134</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>2025-01</td>\n", | |
| " <td>135</td>\n", | |
| " <td>neutral</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a851ff68-be2f-4e9a-849f-62e1c2a663f1')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-a851ff68-be2f-4e9a-849f-62e1c2a663f1 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-a851ff68-be2f-4e9a-849f-62e1c2a663f1');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "text/plain": [ | |
| " title month units_sold sentiment_label\n", | |
| "0 A Light in the Attic 2024-09 127 neutral\n", | |
| "1 A Light in the Attic 2024-10 136 neutral\n", | |
| "2 A Light in the Attic 2024-11 129 neutral\n", | |
| "3 A Light in the Attic 2024-12 134 neutral\n", | |
| "4 A Light in the Attic 2025-01 135 neutral" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_sales = pd.DataFrame(sales_data)\n", | |
| "\n", | |
| "df_sales.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "EhIjz9WohAmZ" | |
| }, | |
| "source": [ | |
| "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "MzbZvLcAhGaH", | |
| "outputId": "385401c5-fa65-4eae-fff5-e8ecbcc99fc4" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " title month units_sold sentiment_label\n", | |
| "0 A Light in the Attic 2024-09 127 neutral\n", | |
| "1 A Light in the Attic 2024-10 136 neutral\n", | |
| "2 A Light in the Attic 2024-11 129 neutral\n", | |
| "3 A Light in the Attic 2024-12 134 neutral\n", | |
| "4 A Light in the Attic 2025-01 135 neutral\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", | |
| "\n", | |
| "print(df_sales.head())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "7g9gqBgQMtJn" | |
| }, | |
| "source": [ | |
| "## **5.** 🎯 Generate synthetic customer reviews" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Gi4y9M9KuDWx" | |
| }, | |
| "source": [ | |
| "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a dictionary called synthetic_reviews_by_sentiment that maps each sentiment label (\"positive\", \"neutral\", \"negative\") to a list of 50 distinct generic book review texts*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "id": "b3cd2a50" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Pool of canned review texts, keyed by sentiment label.\n", | |
| "# NOTE(review): the section heading asks for 50 texts per label; only three per label are listed here.\n", | |
| "synthetic_reviews_by_sentiment = {\n", | |
| " \"positive\": [\n", | |
| " \"A compelling and heartwarming read that stayed with me long after I finished.\",\n", | |
| " \"Brilliantly written! The characters were unforgettable and the plot was engaging.\",\n", | |
| " \"One of the best books I've read this year — inspiring and emotionally rich.\",\n", | |
| " ],\n", | |
| " \"neutral\": [\n", | |
| " \"An average book — not great, but not bad either.\",\n", | |
| " \"Some parts really stood out, others felt a bit flat.\",\n", | |
| " \"It was okay overall. A decent way to pass the time.\",\n", | |
| " ],\n", | |
| " \"negative\": [\n", | |
| " \"I struggled to get through this one — it just didn’t grab me.\",\n", | |
| " \"The plot was confusing and the characters felt underdeveloped.\",\n", | |
| " \"Disappointing. I had high hopes, but they weren't met.\",\n", | |
| " ]\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "fQhfVaDmuULT" | |
| }, | |
| "source": [ | |
| "### *b. Generate 10 reviews per book by randomly sampling from the 50 review texts that match the book's sentiment label*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "id": "l2SRc3PjuTGM" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "review_rows = []\n", | |
| "for _, row in df_books.iterrows():\n", | |
| " title = row[\"title\"]\n", | |
| " sentiment_label = row[\"sentiment_label\"]\n", | |
| " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", | |
| "\n", | |
| " sampled_reviews = random.choices(review_pool, k=10)\n", | |
| "\n", | |
| " for review_text in sampled_reviews:\n", | |
| " review_rows.append({\n", | |
| " \"title\": title,\n", | |
| " \"sentiment_label\": sentiment_label,\n", | |
| " \"review_text\": review_text,\n", | |
| " \"rating\": row[\"rating\"],\n", | |
| " \"popularity_score\": row[\"popularity_score\"]\n", | |
| " })\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "bmJMXF-Bukdm" | |
| }, | |
| "source": [ | |
| "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "id": "ZUKUqZsuumsp" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df_reviews = pd.DataFrame(review_rows)\n", | |
| "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "_602pYUS3gY5" | |
| }, | |
| "source": [ | |
| "### *c (cont.). Prepare the inputs for R: title-level features and a monthly revenue series*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "3946e521", | |
| "outputId": "f50b5ec3-4143-4ff6-8596-5ef92ad874aa" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "✅ Wrote synthetic_title_level_features.csv\n", | |
| "✅ Wrote synthetic_monthly_revenue_series.csv\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import numpy as np\n", | |
| "\n", | |
| "def _safe_num(s):\n", | |
| " return pd.to_numeric(\n", | |
| " pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n", | |
| " errors=\"coerce\"\n", | |
| " )\n", | |
| "\n", | |
| "# --- Clean book metadata (price/rating) ---\n", | |
| "df_books_r = df_books.copy()\n", | |
| "if \"price\" in df_books_r.columns:\n", | |
| " df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n", | |
| "if \"rating\" in df_books_r.columns:\n", | |
| " df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n", | |
| "\n", | |
| "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n", | |
| "\n", | |
| "# --- Clean sales ---\n", | |
| "df_sales_r = df_sales.copy()\n", | |
| "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n", | |
| "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n", | |
| "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n", | |
| "\n", | |
| "# --- Clean reviews ---\n", | |
| "df_reviews_r = df_reviews.copy()\n", | |
| "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n", | |
| "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n", | |
| "if \"rating\" in df_reviews_r.columns:\n", | |
| " df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n", | |
| "if \"popularity_score\" in df_reviews_r.columns:\n", | |
| " df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n", | |
| "\n", | |
| "# --- Sentiment shares per title (from reviews) ---\n", | |
| "sent_counts = (\n", | |
| " df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n", | |
| " .size()\n", | |
| " .unstack(fill_value=0)\n", | |
| ")\n", | |
| "for lab in [\"positive\", \"neutral\", \"negative\"]:\n", | |
| " if lab not in sent_counts.columns:\n", | |
| " sent_counts[lab] = 0\n", | |
| "\n", | |
| "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n", | |
| "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n", | |
| "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n", | |
| "sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n", | |
| "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n", | |
| "sent_counts = sent_counts.reset_index()\n", | |
| "\n", | |
| "# --- Sales aggregation per title ---\n", | |
| "sales_by_title = (\n", | |
| " df_sales_r.dropna(subset=[\"title\"])\n", | |
| " .groupby(\"title\", as_index=False)\n", | |
| " .agg(\n", | |
| " months_observed=(\"month\", \"nunique\"),\n", | |
| " avg_units_sold=(\"units_sold\", \"mean\"),\n", | |
| " total_units_sold=(\"units_sold\", \"sum\"),\n", | |
| " )\n", | |
| ")\n", | |
| "\n", | |
| "# --- Title-level features (join sales + books + sentiment) ---\n", | |
| "df_title = (\n", | |
| " sales_by_title\n", | |
| " .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n", | |
| " .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n", | |
| " on=\"title\", how=\"left\")\n", | |
| ")\n", | |
| "\n", | |
| "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n", | |
| "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n", | |
| "\n", | |
| "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n", | |
| "print(\"✅ Wrote synthetic_title_level_features.csv\")\n", | |
| "\n", | |
| "# --- Monthly revenue series (proxy: units_sold * price) ---\n", | |
| "monthly_rev = (\n", | |
| " df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n", | |
| ")\n", | |
| "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n", | |
| "\n", | |
| "df_monthly = (\n", | |
| " monthly_rev.dropna(subset=[\"month\"])\n", | |
| " .groupby(\"month\", as_index=False)[\"revenue\"]\n", | |
| " .sum()\n", | |
| " .rename(columns={\"revenue\": \"total_revenue\"})\n", | |
| " .sort_values(\"month\")\n", | |
| ")\n", | |
| "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n", | |
| "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n", | |
| " df_monthly = (\n", | |
| " df_sales_r.dropna(subset=[\"month\"])\n", | |
| " .groupby(\"month\", as_index=False)[\"units_sold\"]\n", | |
| " .sum()\n", | |
| " .rename(columns={\"units_sold\": \"total_revenue\"})\n", | |
| " .sort_values(\"month\")\n", | |
| " )\n", | |
| "\n", | |
| "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n", | |
| "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n", | |
| "print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "RYvGyVfXuo54" | |
| }, | |
| "source": [ | |
| "### *d. ✋🏻🛑⛔️ View the first few lines of df_reviews*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "xfE8NMqOurKo", | |
| "outputId": "fc16b2df-502f-4aaf-e6c9-63e3b2a4f83e" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 2520,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 799,\n \"samples\": [\n \"Where'd You Go, Bernadette\",\n \"Mockingjay (The Hunger Games #3)\",\n \"The Mindfulness and Acceptance Workbook for Anxiety: A Guide to Breaking Free from Anxiety, Phobias, and Worry Using Acceptance and Commitment Therapy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"A compelling and heartwarming read that stayed with me long after I finished.\",\n \"Some parts really stood out, others felt a bit flat.\",\n \"I struggled to get through this one \\u2014 it just didn\\u2019t grab me.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", | |
| "type": "dataframe", | |
| "variable_name": "df_reviews" | |
| }, | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-a985ad05-8a6d-442d-a94f-f2cdc781b1f7\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>title</th>\n", | |
| " <th>sentiment_label</th>\n", | |
| " <th>review_text</th>\n", | |
| " <th>rating</th>\n", | |
| " <th>popularity_score</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>neutral</td>\n", | |
| " <td>An average book — not great, but not bad either.</td>\n", | |
| " <td>Three</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>neutral</td>\n", | |
| " <td>Some parts really stood out, others felt a bit...</td>\n", | |
| " <td>Three</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>A Light in the Attic</td>\n", | |
| " <td>neutral</td>\n", | |
| " <td>It was okay overall. A decent way to pass the ...</td>\n", | |
| " <td>Three</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Tipping the Velvet</td>\n", | |
| " <td>negative</td>\n", | |
| " <td>The plot was confusing and the characters felt...</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Tipping the Velvet</td>\n", | |
| " <td>negative</td>\n", | |
| " <td>Disappointing. I had high hopes, but they were...</td>\n", | |
| " <td>One</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a985ad05-8a6d-442d-a94f-f2cdc781b1f7')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-a985ad05-8a6d-442d-a94f-f2cdc781b1f7 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-a985ad05-8a6d-442d-a94f-f2cdc781b1f7');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "text/plain": [ | |
| " title sentiment_label \\\n", | |
| "0 A Light in the Attic neutral \n", | |
| "1 A Light in the Attic neutral \n", | |
| "2 A Light in the Attic neutral \n", | |
| "3 Tipping the Velvet negative \n", | |
| "4 Tipping the Velvet negative \n", | |
| "\n", | |
| " review_text rating popularity_score \n", | |
| "0 An average book — not great, but not bad either. Three 3 \n", | |
| "1 Some parts really stood out, others felt a bit... Three 3 \n", | |
| "2 It was okay overall. A decent way to pass the ... Three 3 \n", | |
| "3 The plot was confusing and the characters felt... One 2 \n", | |
| "4 Disappointing. I had high hopes, but they were... One 2 " | |
| ] | |
| }, | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_reviews.head()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "collapsed_sections": [ | |
| "jpASMyIQMaAq", | |
| "lquNYCbfL9IM", | |
| "p-1Pr2szaqLk", | |
| "T8AdKkmASq9a", | |
| "7g9gqBgQMtJn" | |
| ], | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } | |