{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# 🤖 **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** 📦 Install required packages" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "4f65d7d2-075c-46e2-b9a5-c824813a6712" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 pages\n", "for page in range(1, 51):\n", " url = base_url.format(page)\n", " response = requests.get(url, headers=headers)\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", " for book in books:\n", " titles.append(book.h3.a[\"title\"])\n", " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", " ratings.append(book.p.get(\"class\")[1])\n", "\n", " time.sleep(0.5) # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "id": "l5FkkNhUYTHh", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "8e8f7559-2792-473c-f77e-8cb166559adc" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 42 } ], "source": [ "import pandas as pd\n", "\n", "# Create dataframe\n", "df_books = pd.DataFrame({\n", " \"title\": titles,\n", " \"price\": prices,\n", " \"rating\": ratings\n", "})\n", "\n", "# View first rows\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. ✋🏻🛑⛔️ View first fiew lines*" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "O_wIvTxYZqCK", "outputId": "92b767d7-7954-41fd-f3d6-91caa841fd05" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 44 } ], "source": [ "# View first few lines\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", " return int(np.clip(base + trend_factor, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "id": "V-G3OCUCgR07", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "64821b96-7095-4e9e-b281-45fe4c349cd9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating popularity_score\n", "0 A Light in the Attic 51.77 Three 3\n", "1 Tipping the Velvet 53.74 One 2\n", "2 Soumission 50.10 One 2\n", "3 Sharp Objects 47.82 Four 4\n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_score
0A Light in the Attic51.77Three3
1Tipping the Velvet53.74One2
2Soumission50.10One2
3Sharp Objects47.82Four4
4Sapiens: A Brief History of Humankind54.23Five3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 47 } ], "source": [ "import numpy as np\n", "import random\n", "\n", "# Re-seed (optional but good practice)\n", "random.seed(2025)\n", "np.random.seed(2025)\n", "\n", "# Define the function\n", "def generate_popularity_score(rating):\n", " base = {\n", " \"One\": 2,\n", " \"Two\": 3,\n", " \"Three\": 3,\n", " \"Four\": 4,\n", " \"Five\": 4\n", " }.get(rating, 3)\n", "\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", "\n", " return int(np.clip(base + trend_factor, 1, 5))\n", "\n", "# Create the new column\n", "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n", "\n", "# View result\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "id": "tafQj8_7gYCG", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "2c8fac9f-483d-4bf1-f232-37169b21eb06" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating popularity_score \\\n", "0 A Light in the Attic 51.77 Three 3 \n", "1 Tipping the Velvet 53.74 One 2 \n", "2 Soumission 50.10 One 2 \n", "3 Sharp Objects 47.82 Four 4 \n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n", "\n", " sentiment_label \n", "0 neutral \n", "1 negative \n", "2 negative \n", "3 positive \n", "4 neutral " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_scoresentiment_label
0A Light in the Attic51.77Three3neutral
1Tipping the Velvet53.74One2negative
2Soumission50.10One2negative
3Sharp Objects47.82Four4positive
4Sapiens: A Brief History of Humankind54.23Five3neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 49 } ], "source": [ "# Define the sentiment function\n", "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"\n", "\n", "# Create sentiment_label column\n", "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n", "\n", "# View first rows\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** 📈 Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment):\n", " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", "\n", " if sentiment == \"positive\":\n", " base = random.randint(200, 300)\n", " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", " elif sentiment == \"negative\":\n", " base = random.randint(20, 80)\n", " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", " else: # neutral\n", " base = random.randint(80, 160)\n", " trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", " noise = np.random.normal(0, 5, len(months))\n", " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", " records = generate_sales_profile(row[\"sentiment_label\"])\n", " for month, units in records:\n", " sales_data.append({\n", " \"title\": row[\"title\"],\n", " \"month\": month,\n", " \"units_sold\": units,\n", " \"sentiment_label\": row[\"sentiment_label\"]\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "id": "wcN6gtiZg-ws" }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df_sales = pd.DataFrame(sales_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "MzbZvLcAhGaH", "outputId": "0a931b3d-73f6-4a0a-ec81-78fe688d3ce6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 100 neutral\n", "1 A Light in the Attic 2024-10 109 neutral\n", "2 A Light in the Attic 2024-11 102 neutral\n", "3 A Light in the Attic 2024-12 107 neutral\n", "4 A Light in the Attic 2025-01 108 neutral" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09100neutral
1A Light in the Attic2024-10109neutral
2A Light in the Attic2024-11102neutral
3A Light in the Attic2024-12107neutral
4A Light in the Attic2025-01108neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_sales", "summary": "{\n \"name\": \"df_sales\",\n \"rows\": 18000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-09\",\n \"2024-10\",\n \"2025-05\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 354,\n \"samples\": [\n 214,\n 289,\n 205\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 53 } ], "source": [ "# Save df_sales as CSV\n", "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "# View first few lines\n", "df_sales.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** 🎯 Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "id": "b3cd2a50" }, "outputs": [], "source": [ "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"An absolutely wonderful read from start to finish.\",\n", " \"I couldn’t put this book down—truly captivating.\",\n", " \"The storytelling was immersive and beautifully crafted.\",\n", " \"A heartfelt and inspiring story.\",\n", " \"The characters were vivid and unforgettable.\",\n", " \"An engaging plot with excellent pacing.\",\n", " \"A beautifully written and emotionally rich novel.\",\n", " \"One of the best books I’ve read this year.\",\n", " \"A powerful and uplifting experience.\",\n", " \"The author’s writing style was elegant and compelling.\",\n", " \"An unforgettable journey from beginning to end.\",\n", " \"The themes were handled with depth and care.\",\n", " \"A gripping and satisfying read.\",\n", " \"An imaginative and creative masterpiece.\",\n", " \"The emotional impact was profound.\",\n", " \"A delightful and charming story.\",\n", " \"I was hooked from the first page.\",\n", " \"A wonderfully constructed narrative.\",\n", " \"An inspiring and thought-provoking novel.\",\n", " \"The character development was superb.\",\n", " \"A brilliant blend of emotion and storytelling.\",\n", " \"An exciting and immersive experience.\",\n", " \"The plot twists were perfectly executed.\",\n", " \"A refreshing and engaging perspective.\",\n", " \"A deeply moving and memorable book.\",\n", " \"An outstanding achievement in storytelling.\",\n", " \"The prose was smooth and captivating.\",\n", " \"A compelling and beautifully told tale.\",\n", " \"It exceeded all my expectations.\",\n", " \"A fantastic and rewarding read.\",\n", " \"The dialogue felt natural and authentic.\",\n", " \"An emotionally satisfying conclusion.\",\n", " \"A truly exceptional novel.\",\n", " \"The story flowed effortlessly.\",\n", " \"An inspiring and meaningful narrative.\",\n", " \"A captivating and heartfelt read.\",\n", " \"The pacing kept me engaged throughout.\",\n", " \"A masterfully written story.\",\n", " \"An entertaining and powerful book.\",\n", " \"A rich and immersive reading experience.\",\n", " \"An impressive and unforgettable novel.\",\n", " \"The storytelling was both creative and compelling.\",\n", " \"A stunning and beautifully crafted book.\",\n", " \"A five-star reading experience.\",\n", " \"A deeply engaging and satisfying story.\",\n", " \"An uplifting and memorable journey.\",\n", " \"The narrative was gripping and emotional.\",\n", " \"A beautifully structured novel.\",\n", " \"An inspiring and powerful read.\",\n", " \"A truly remarkable book.\"\n", " ],\n", " \"neutral\": [\n", " \"It was an okay read overall.\",\n", " \"The book had its moments but wasn’t remarkable.\",\n", " \"A fairly average story.\",\n", " \"Some parts were engaging, others less so.\",\n", " \"It met my expectations but didn’t exceed them.\",\n", " \"A decent way to pass the time.\",\n", " \"The writing was fine, nothing extraordinary.\",\n", " \"An average book with a straightforward plot.\",\n", " \"I found it moderately interesting.\",\n", " \"Neither particularly good nor bad.\",\n", " \"The characters were adequate but not memorable.\",\n", " \"It was entertaining enough.\",\n", " \"A standard story for the genre.\",\n", " \"Some chapters were stronger than others.\",\n", " \"The pacing was inconsistent at times.\",\n", " \"It held my attention in parts.\",\n", " \"An acceptable but unremarkable novel.\",\n", " \"The plot was predictable but serviceable.\",\n", " \"It was fine for a casual read.\",\n", " \"A reasonably written book.\",\n", " \"I didn’t love it, but I didn’t dislike it either.\",\n", " \"The themes were presented clearly but simply.\",\n", " \"A mild and easy read.\",\n", " \"It delivered what it promised.\",\n", " \"The story was straightforward and simple.\",\n", " \"An average reading experience.\",\n", " \"There were both strengths and weaknesses.\",\n", " \"It was somewhat engaging.\",\n", " \"A typical example of its genre.\",\n", " \"The ending was satisfactory.\",\n", " \"It had a few interesting ideas.\",\n", " \"The execution was decent overall.\",\n", " \"A passable and light read.\",\n", " \"It kept me mildly interested.\",\n", " \"The writing style was standard.\",\n", " \"Some characters stood out more than others.\",\n", " \"It was readable but not memorable.\",\n", " \"An ordinary but competent book.\",\n", " \"The story moved at a steady pace.\",\n", " \"It was fine for what it was.\",\n", " \"A serviceable narrative.\",\n", " \"The plot developed in a predictable way.\",\n", " \"An adequate storytelling effort.\",\n", " \"It had its ups and downs.\",\n", " \"A modest and simple read.\",\n", " \"The overall experience was balanced.\",\n", " \"It didn’t leave a strong impression.\",\n", " \"A fairly routine novel.\",\n", " \"It was acceptable but not standout.\",\n", " \"An unexceptional yet readable book.\"\n", " ],\n", " \"negative\": [\n", " \"I struggled to finish this book.\",\n", " \"The story failed to hold my interest.\",\n", " \"The plot felt confusing and disjointed.\",\n", " \"I was disappointed by the overall execution.\",\n", " \"The characters lacked depth and development.\",\n", " \"It didn’t live up to the hype.\",\n", " \"The pacing was painfully slow.\",\n", " \"I found the writing style hard to enjoy.\",\n", " \"The narrative felt flat and uninspired.\",\n", " \"A frustrating reading experience.\",\n", " \"The dialogue felt unnatural and forced.\",\n", " \"It was difficult to stay engaged.\",\n", " \"The story lacked originality.\",\n", " \"I expected much more from this book.\",\n", " \"The ending was unsatisfying.\",\n", " \"The plot holes were distracting.\",\n", " \"It felt overly long and drawn out.\",\n", " \"The characters were forgettable.\",\n", " \"The writing seemed rushed.\",\n", " \"I couldn’t connect with the story.\",\n", " \"The themes were poorly developed.\",\n", " \"It failed to capture my attention.\",\n", " \"A disappointing and underwhelming novel.\",\n", " \"The structure felt messy.\",\n", " \"The book lacked emotional impact.\",\n", " \"I found it boring overall.\",\n", " \"The storytelling was inconsistent.\",\n", " \"The conflict felt forced.\",\n", " \"It didn’t offer anything new.\",\n", " \"The pacing dragged throughout.\",\n", " \"I had trouble staying focused while reading.\",\n", " \"The plot twists were predictable.\",\n", " \"It felt repetitive and dull.\",\n", " \"The characters’ motivations were unclear.\",\n", " \"The writing lacked polish.\",\n", " \"A forgettable and bland read.\",\n", " \"The story never fully came together.\",\n", " \"It was not worth the time investment.\",\n", " \"The narrative felt shallow.\",\n", " \"I was left feeling unsatisfied.\",\n", " \"The book lacked cohesion.\",\n", " \"The execution fell short of expectations.\",\n", " \"It didn’t resonate with me at all.\",\n", " \"The story felt underdeveloped.\",\n", " \"A tedious reading experience.\",\n", " \"The author’s style didn’t work for me.\",\n", " \"The plot felt overly complicated.\",\n", " \"The book failed to engage me emotionally.\",\n", " \"It was a disappointing effort overall.\",\n", " \"I wouldn’t recommend this one.\"\n", " ]\n", "}" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [ "review_rows = []\n", "for _, row in df_books.iterrows():\n", " title = row['title']\n", " sentiment_label = row['sentiment_label']\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", " sampled_reviews = random.sample(review_pool, 10)\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row['rating'],\n", " \"popularity_score\": row['popularity_score']\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" ] }, { "cell_type": "markdown", "source": [ "### *c. inputs for R*" ], "metadata": { "id": "_602pYUS3gY5" } }, { "cell_type": "code", "execution_count": 57, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3946e521", "outputId": "cac6769e-1856-4a5a-acb0-b4fe1ae98a56" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Wrote synthetic_title_level_features.csv\n", "✅ Wrote synthetic_monthly_revenue_series.csv\n" ] } ], "source": [ "import numpy as np\n", "\n", "def _safe_num(s):\n", " return pd.to_numeric(\n", " pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n", " errors=\"coerce\"\n", " )\n", "\n", "# --- Clean book metadata (price/rating) ---\n", "df_books_r = df_books.copy()\n", "if \"price\" in df_books_r.columns:\n", " df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n", "if \"rating\" in df_books_r.columns:\n", " df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n", "\n", "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n", "\n", "# --- Clean sales ---\n", "df_sales_r = df_sales.copy()\n", "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n", "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n", "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n", "\n", "# --- Clean reviews ---\n", "df_reviews_r = df_reviews.copy()\n", "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n", "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n", "if \"rating\" in df_reviews_r.columns:\n", " df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n", "if \"popularity_score\" in df_reviews_r.columns:\n", " df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n", "\n", "# --- Sentiment shares per title (from reviews) ---\n", "sent_counts = (\n", " df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n", " .size()\n", " .unstack(fill_value=0)\n", ")\n", "for lab in [\"positive\", \"neutral\", \"negative\"]:\n", " if lab not in sent_counts.columns:\n", " sent_counts[lab] = 0\n", "\n", "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n", "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n", "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n", "sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n", "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n", "sent_counts = sent_counts.reset_index()\n", "\n", "# --- Sales aggregation per title ---\n", "sales_by_title = (\n", " df_sales_r.dropna(subset=[\"title\"])\n", " .groupby(\"title\", as_index=False)\n", " .agg(\n", " months_observed=(\"month\", \"nunique\"),\n", " avg_units_sold=(\"units_sold\", \"mean\"),\n", " total_units_sold=(\"units_sold\", \"sum\"),\n", " )\n", ")\n", "\n", "# --- Title-level features (join sales + books + sentiment) ---\n", "df_title = (\n", " sales_by_title\n", " .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n", " .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n", " on=\"title\", how=\"left\")\n", ")\n", "\n", "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n", "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n", "\n", "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n", "print(\"✅ Wrote synthetic_title_level_features.csv\")\n", "\n", "# --- Monthly revenue series (proxy: units_sold * price) ---\n", "monthly_rev = (\n", " df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n", ")\n", "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n", "\n", "df_monthly = (\n", " monthly_rev.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"revenue\"]\n", " .sum()\n", " .rename(columns={\"revenue\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", ")\n", "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n", "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n", " df_monthly = (\n", " df_sales_r.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"units_sold\"]\n", " .sum()\n", " .rename(columns={\"units_sold\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", " )\n", "\n", "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n", "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n", "print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. ✋🏻🛑⛔️ View the first few lines*" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "xfE8NMqOurKo", "outputId": "39d8355e-626e-47cb-aa7c-524649dbbcd4" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label review_text \\\n", "0 A Light in the Attic neutral A passable and light read. \n", "1 A Light in the Attic neutral An average reading experience. \n", "2 A Light in the Attic neutral It was somewhat engaging. \n", "3 A Light in the Attic neutral It was acceptable but not standout. \n", "4 A Light in the Attic neutral A serviceable narrative. \n", "\n", " rating popularity_score \n", "0 Three 3 \n", "1 Three 3 \n", "2 Three 3 \n", "3 Three 3 \n", "4 Three 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralA passable and light read.Three3
1A Light in the AtticneutralAn average reading experience.Three3
2A Light in the AtticneutralIt was somewhat engaging.Three3
3A Light in the AtticneutralIt was acceptable but not standout.Three3
4A Light in the AtticneutralA serviceable narrative.Three3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"An entertaining and powerful book.\",\n \"It felt repetitive and dull.\",\n \"One of the best books I\\u2019ve read this year.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 58 } ], "source": [ "import random\n", "import pandas as pd\n", "\n", "# Recreate review_rows\n", "review_rows = []\n", "\n", "for _, row in df_books.iterrows():\n", " title = row[\"title\"]\n", " sentiment_label = row[\"sentiment_label\"]\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", "\n", " sampled_reviews = random.sample(review_pool, 10)\n", "\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row[\"rating\"],\n", " \"popularity_score\": row[\"popularity_score\"]\n", " })\n", "\n", "# Create dataframe\n", "df_reviews = pd.DataFrame(review_rows)\n", "\n", "# View first rows\n", "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", "EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }