{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# πŸ€– **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** πŸ“¦ Install required packages" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "13d0dd5e-82c6-489f-b1f0-e970186a4eb7" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 pages\n", "for page in range(1, 51):\n", " url = base_url.format(page)\n", " response = requests.get(url, headers=headers)\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", " for book in books:\n", " titles.append(book.h3.a[\"title\"])\n", " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", " ratings.append(book.p.get(\"class\")[1])\n", "\n", " time.sleep(0.5) # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c. βœ‹πŸ»πŸ›‘β›”οΈ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "source": [ "# Create DataFrame\n", "df_books = pd.DataFrame({\n", " \"title\": titles,\n", " \"price\": prices,\n", " \"rating\": ratings\n", "})\n", "\n", "# Display first few rows\n", "df_books.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "p5aK0rcz_6uN", "outputId": "0349ca6e-4e6e-4e95-f0a9-996b0eaed67d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l5FkkNhUYTHh", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "b0bfbd33-73f8-4d78-8f2b-2e0b74e2c606" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/200.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91mβ•Έ\u001b[0m\u001b[90m━\u001b[0m \u001b[32m194.6/200.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.8/200.8 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m92.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m52.7/52.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m201.5/201.5 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m69.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/sdv/single_table/base.py:168: FutureWarning: The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.\n", " warnings.warn(DEPRECATION_MSG, FutureWarning)\n", "/usr/local/lib/python3.12/dist-packages/sdv/single_table/base.py:134: UserWarning: We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.\n", " warnings.warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Real (head):\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " price rating_score\n", "0 51.77 3\n", "1 53.74 1\n", "2 50.10 1\n", "3 47.82 4\n", "4 54.23 5" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricerating_score
051.773
153.741
250.101
347.824
454.235
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 4,\n \"samples\": [\n 1,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Synthetic (head):\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " price rating_score\n", "0 54.70 1\n", "1 44.07 1\n", "2 53.36 4\n", "3 39.32 5\n", "4 29.55 5" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricerating_score
054.701
144.071
253.364
339.325
429.555
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.399632205034946,\n \"min\": 29.55,\n \"max\": 54.7,\n \"num_unique_values\": 5,\n \"samples\": [\n 44.07,\n 29.55,\n 53.36\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 1,\n 4,\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Real stats:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " price rating_score\n", "count 1000.00000 1000.000000\n", "mean 35.07035 2.923000\n", "std 14.44669 1.434967\n", "min 10.00000 1.000000\n", "25% 22.10750 2.000000\n", "50% 35.98000 3.000000\n", "75% 47.45750 4.000000\n", "max 59.99000 5.000000" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricerating_score
count1000.000001000.000000
mean35.070352.923000
std14.446691.434967
min10.000001.000000
25%22.107502.000000
50%35.980003.000000
75%47.457504.000000
max59.990005.000000
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 342.59073671210575,\n \"min\": 10.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 35.07035,\n 35.980000000000004,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352.5781108922889,\n \"min\": 1.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 2.923,\n 3.0,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Synthetic stats:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " price rating_score\n", "count 1000.000000 1000.000000\n", "mean 33.977490 2.914000\n", "std 14.402466 1.415135\n", "min 10.050000 1.000000\n", "25% 21.420000 2.000000\n", "50% 33.695000 3.000000\n", "75% 45.472500 4.000000\n", "max 59.960000 5.000000" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricerating_score
count1000.0000001000.000000
mean33.9774902.914000
std14.4024661.415135
min10.0500001.000000
25%21.4200002.000000
50%33.6950003.000000
75%45.4725004.000000
max59.9600005.000000
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 342.8831053927661,\n \"min\": 10.05,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 33.977489999999996,\n 33.695,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352.57957731182347,\n \"min\": 1.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 2.914,\n 3.0,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saved: books_real.csv and books_synthetic.csv\n" ] } ], "source": [ "# =========================\n", "# Part 2-c: Create a synthetic dataset from the scraped data\n", "# =========================\n", "\n", "# 1) (Colab) Install SDV if needed\n", "!pip -q install sdv\n", "\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# 2) Build the real dataset from your lists\n", "df_real = pd.DataFrame({\n", " \"title\": titles,\n", " \"price\": prices,\n", " \"rating\": ratings\n", "})\n", "\n", "# Optional: basic cleaning\n", "df_real = df_real.drop_duplicates().dropna()\n", "df_real[\"price\"] = pd.to_numeric(df_real[\"price\"], errors=\"coerce\")\n", "df_real = df_real.dropna(subset=[\"price\"])\n", "\n", "# 3) (Optional but useful) Convert rating text to an ordered numeric score\n", "rating_map = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n", "df_real[\"rating_score\"] = df_real[\"rating\"].map(rating_map).fillna(np.nan)\n", "df_real = df_real.dropna(subset=[\"rating_score\"])\n", "df_real[\"rating_score\"] = df_real[\"rating_score\"].astype(int)\n", "\n", "# If you want to synthesize only the β€œmodel-friendly” columns:\n", "df_model = df_real[[\"price\", \"rating_score\"]].copy()\n", "\n", "# 4) Train a synthetic data generator and sample synthetic rows\n", "from sdv.metadata import SingleTableMetadata\n", "from sdv.single_table import GaussianCopulaSynthesizer\n", "\n", "metadata = SingleTableMetadata()\n", "metadata.detect_from_dataframe(df_model)\n", "\n", "synth = GaussianCopulaSynthesizer(metadata)\n", "synth.fit(df_model)\n", "\n", "# Choose how many synthetic rows you want\n", "n_synth = len(df_model) # same size as real\n", "df_synth = synth.sample(n_synth)\n", "\n", "# 5) Quick sanity checks (optional)\n", "print(\"Real (head):\")\n", "display(df_model.head())\n", "\n", "print(\"Synthetic (head):\")\n", "display(df_synth.head())\n", "\n", "print(\"Real stats:\")\n", "display(df_model.describe())\n", "\n", "print(\"Synthetic stats:\")\n", "display(df_synth.describe())\n", "\n", "# 6) Save outputs\n", "df_real.to_csv(\"books_real.csv\", index=False)\n", "df_synth.to_csv(\"books_synthetic.csv\", index=False)\n", "\n", "print(\"Saved: books_real.csv and books_synthetic.csv\")" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# πŸ’Ύ Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# πŸ’Ύ Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. βœ‹πŸ»πŸ›‘β›”οΈ View first fiew lines*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "O_wIvTxYZqCK", "outputId": "f2296107-167d-4f7b-b363-a718c98dd34f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 6 } ], "source": [ "# Display first rows\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", " return int(np.clip(base + trend_factor, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. βœ‹πŸ»πŸ›‘β›”οΈ Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "V-G3OCUCgR07", "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "outputId": "99af6d80-1016-4798-dc27-322eb8f921cf" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating popularity_score\n", "0 A Light in the Attic 51.77 Three 3\n", "1 Tipping the Velvet 53.74 One 2\n", "2 Soumission 50.10 One 2\n", "3 Sharp Objects 47.82 Four 4\n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_score
0A Light in the Attic51.77Three3
1Tipping the Velvet53.74One2
2Soumission50.10One2
3Sharp Objects47.82Four4
4Sapiens: A Brief History of Humankind54.23Five3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 9 } ], "source": [ "# Create popularity_score column\n", "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(\n", " lambda x: generate_popularity_score(x)\n", ")\n", "\n", "# Preview results\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e. βœ‹πŸ»πŸ›‘β›”οΈ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "tafQj8_7gYCG", "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "outputId": "708aa87a-9b99-4e6c-85c3-ca8f45cd3774" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating popularity_score \\\n", "0 A Light in the Attic 51.77 Three 3 \n", "1 Tipping the Velvet 53.74 One 2 \n", "2 Soumission 50.10 One 2 \n", "3 Sharp Objects 47.82 Four 4 \n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n", "\n", " sentiment_label \n", "0 neutral \n", "1 negative \n", "2 negative \n", "3 positive \n", "4 neutral " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_scoresentiment_label
0A Light in the Attic51.77Three3neutral
1Tipping the Velvet53.74One2negative
2Soumission50.10One2negative
3Sharp Objects47.82Four4positive
4Sapiens: A Brief History of Humankind54.23Five3neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ], "source": [ "# Create sentiment_label column\n", "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n", "\n", "# Preview results\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** πŸ“ˆ Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment):\n", " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", "\n", " if sentiment == \"positive\":\n", " base = random.randint(200, 300)\n", " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", " elif sentiment == \"negative\":\n", " base = random.randint(20, 80)\n", " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", " else: # neutral\n", " base = random.randint(80, 160)\n", " trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", " noise = np.random.normal(0, 5, len(months))\n", " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", " records = generate_sales_profile(row[\"sentiment_label\"])\n", " for month, units in records:\n", " sales_data.append({\n", " \"title\": row[\"title\"],\n", " \"month\": month,\n", " \"units_sold\": units,\n", " \"sentiment_label\": row[\"sentiment_label\"]\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c. βœ‹πŸ»πŸ›‘β›”οΈ Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wcN6gtiZg-ws", "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "outputId": "9b0fa967-9a80-48c5-d67d-0728c9dbf0a3" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 100 neutral\n", "1 A Light in the Attic 2024-10 109 neutral\n", "2 A Light in the Attic 2024-11 102 neutral\n", "3 A Light in the Attic 2024-12 107 neutral\n", "4 A Light in the Attic 2025-01 108 neutral" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09100neutral
1A Light in the Attic2024-10109neutral
2A Light in the Attic2024-11102neutral
3A Light in the Attic2024-12107neutral
4A Light in the Attic2025-01108neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_sales", "summary": "{\n \"name\": \"df_sales\",\n \"rows\": 18000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-09\",\n \"2024-10\",\n \"2025-05\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 354,\n \"samples\": [\n 214,\n 289,\n 205\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 14 } ], "source": [ "# Create DataFrame from sales_data\n", "df_sales = pd.DataFrame(sales_data)\n", "\n", "# Preview first rows\n", "df_sales.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MzbZvLcAhGaH", "outputId": "5e03d47b-9db2-4bdb-de45-cf5cd23df169" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 100 neutral\n", "1 A Light in the Attic 2024-10 109 neutral\n", "2 A Light in the Attic 2024-11 102 neutral\n", "3 A Light in the Attic 2024-12 107 neutral\n", "4 A Light in the Attic 2025-01 108 neutral\n" ] } ], "source": [ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "print(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** 🎯 Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. βœ‹πŸ»πŸ›‘β›”οΈ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b3cd2a50" }, "outputs": [], "source": [ "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"A compelling and heartwarming read that stayed with me long after I finished.\",\n", " \"Brilliantly written with unforgettable characters.\",\n", " \"An inspiring story that exceeded my expectations.\",\n", " \"Absolutely loved the pacing and emotional depth.\",\n", " \"A beautifully crafted narrative from start to finish.\",\n", " \"Captivating, immersive, and hard to put down.\",\n", " \"An outstanding book that I would highly recommend.\",\n", " \"The storytelling was vivid and deeply engaging.\",\n", " \"A powerful and moving literary experience.\",\n", " \"Rich characters and a satisfying conclusion.\",\n", " \"An uplifting and memorable journey.\",\n", " \"Exceptionally well-written and thoughtfully structured.\",\n", " \"A masterpiece of modern storytelling.\",\n", " \"Heartfelt and genuinely inspiring.\",\n", " \"The plot twists were clever and well executed.\",\n", " \"A delightful surprise that kept me hooked.\",\n", " \"Emotionally resonant and beautifully paced.\",\n", " \"An engaging story with strong character development.\",\n", " \"A truly rewarding read.\",\n", " \"Fresh, original, and wonderfully told.\",\n", " \"A gripping narrative that never lost momentum.\",\n", " \"Highly entertaining and emotionally satisfying.\",\n", " \"An absolute page-turner.\",\n", " \"Creative, thoughtful, and deeply moving.\",\n", " \"A remarkable reading experience.\",\n", " \"Well-developed themes and compelling dialogue.\",\n", " \"A standout book in its genre.\",\n", " \"Engrossing and thoughtfully written.\",\n", " \"An unforgettable literary journey.\",\n", " \"Full of heart and intelligence.\",\n", " \"A thoroughly enjoyable experience.\",\n", " \"Smart, engaging, and beautifully detailed.\",\n", " \"A story that lingers in the best way.\",\n", " \"Masterfully executed with emotional depth.\",\n", " \"An inspiring and touching tale.\",\n", " \"Richly imagined and skillfully told.\",\n", " \"A fantastic read from beginning to end.\",\n", " \"Compelling characters and a strong narrative arc.\",\n", " \"Deeply satisfying and wonderfully crafted.\",\n", " \"An emotionally rich and powerful book.\",\n", " \"A beautifully told and meaningful story.\",\n", " \"The writing style was elegant and immersive.\",\n", " \"A gripping and heartfelt novel.\",\n", " \"An impressive and engaging work.\",\n", " \"Thoroughly enjoyable with great pacing.\",\n", " \"A brilliant combination of plot and character.\",\n", " \"Highly compelling and well developed.\",\n", " \"A refreshing and inspiring read.\",\n", " \"An exceptional book worth revisiting.\",\n", " \"Simply outstanding in every way.\"\n", " ],\n", "\n", " \"neutral\": [\n", " \"An average book β€” not great, but not bad either.\",\n", " \"Some parts stood out, others felt flat.\",\n", " \"It was okay overall, a decent way to pass the time.\",\n", " \"Fairly standard storytelling without many surprises.\",\n", " \"An adequate read with a few memorable moments.\",\n", " \"Neither impressive nor disappointing.\",\n", " \"A moderate experience with balanced strengths and flaws.\",\n", " \"The plot was predictable but readable.\",\n", " \"It held my attention, though not consistently.\",\n", " \"Reasonably enjoyable but not remarkable.\",\n", " \"A straightforward and simple narrative.\",\n", " \"Decent character work but uneven pacing.\",\n", " \"An acceptable read for a quiet afternoon.\",\n", " \"Not particularly memorable, but not bad.\",\n", " \"Average writing with a serviceable plot.\",\n", " \"Some interesting ideas that weren't fully explored.\",\n", " \"A book that met basic expectations.\",\n", " \"Entertaining enough, though somewhat forgettable.\",\n", " \"An ordinary story told competently.\",\n", " \"Mildly engaging from start to finish.\",\n", " \"A balanced mix of good and mediocre elements.\",\n", " \"It had potential but didn’t fully deliver.\",\n", " \"An easy but unremarkable read.\",\n", " \"The story progressed steadily without major highlights.\",\n", " \"Moderately satisfying overall.\",\n", " \"Nothing groundbreaking, but readable.\",\n", " \"A passable addition to the genre.\",\n", " \"Simple and straightforward storytelling.\",\n", " \"Occasionally engaging, occasionally dull.\",\n", " \"A fairly typical reading experience.\",\n", " \"Competent writing but lacking spark.\",\n", " \"An average effort with decent structure.\",\n", " \"Somewhat enjoyable yet inconsistent.\",\n", " \"A middle-of-the-road novel.\",\n", " \"Neither strongly recommended nor discouraged.\",\n", " \"An alright book with standard themes.\",\n", " \"Readable but not especially captivating.\",\n", " \"It delivered what was expected.\",\n", " \"A conventional and predictable plot.\",\n", " \"Pleasant but easily forgotten.\",\n", " \"Moderately interesting characters.\",\n", " \"A steady but unspectacular read.\",\n", " \"Not bad, just not exceptional.\",\n", " \"An okay narrative with minor highlights.\",\n", " \"Serviceable storytelling throughout.\",\n", " \"Balanced strengths and weaknesses.\",\n", " \"Acceptable pacing with limited surprises.\",\n", " \"Fine for casual reading.\",\n", " \"An unremarkable but functional story.\",\n", " \"Overall, a neutral reading experience.\"\n", " ],\n", "\n", " \"negative\": [\n", " \"I struggled to get through this one.\",\n", " \"The plot was confusing and underdeveloped.\",\n", " \"Disappointing and not what I expected.\",\n", " \"The characters lacked depth and realism.\",\n", " \"Poor pacing made it hard to stay engaged.\",\n", " \"The storyline felt rushed and incomplete.\",\n", " \"Difficult to connect with the narrative.\",\n", " \"Predictable and uninspired writing.\",\n", " \"The dialogue felt forced and unnatural.\",\n", " \"An underwhelming reading experience.\",\n", " \"The book failed to hold my interest.\",\n", " \"Too many clichΓ©s and weak plot points.\",\n", " \"The structure was messy and unclear.\",\n", " \"A forgettable and frustrating read.\",\n", " \"The themes were poorly executed.\",\n", " \"Lacked emotional depth and engagement.\",\n", " \"I found it tedious and repetitive.\",\n", " \"Not compelling enough to recommend.\",\n", " \"The ending was abrupt and unsatisfying.\",\n", " \"Weak character development throughout.\",\n", " \"The writing style felt flat.\",\n", " \"An overall disappointing novel.\",\n", " \"Hard to follow and poorly organized.\",\n", " \"The pacing dragged significantly.\",\n", " \"The concept had promise but failed in execution.\",\n", " \"Unconvincing character motivations.\",\n", " \"A dull and uninspired storyline.\",\n", " \"The narrative felt disconnected.\",\n", " \"More frustrating than enjoyable.\",\n", " \"The book lacked focus and clarity.\",\n", " \"Repetitive and overly predictable.\",\n", " \"The emotional impact was minimal.\",\n", " \"Not engaging from start to finish.\",\n", " \"The story felt unfinished.\",\n", " \"An exhausting and unrewarding read.\",\n", " \"Too slow to maintain interest.\",\n", " \"The plot twists were ineffective.\",\n", " \"Shallow storytelling overall.\",\n", " \"The execution didn’t match the premise.\",\n", " \"A below-average literary effort.\",\n", " \"The characters were difficult to care about.\",\n", " \"Inconsistent tone throughout.\",\n", " \"It failed to meet expectations.\",\n", " \"The narrative lacked cohesion.\",\n", " \"Overly simplistic and bland.\",\n", " \"A frustratingly weak story.\",\n", " \"The pacing and structure were flawed.\",\n", " \"Not worth the time invested.\",\n", " \"A disappointing addition to the genre.\",\n", " \"Ultimately an unsatisfying read.\"\n", " ]\n", "}" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [ "review_rows = []\n", "for _, row in df_books.iterrows():\n", " title = row['title']\n", " sentiment_label = row['sentiment_label']\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", " sampled_reviews = random.sample(review_pool, 10)\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row['rating'],\n", " \"popularity_score\": row['popularity_score']\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. βœ‹πŸ»πŸ›‘β›”οΈ View the first few lines*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "xfE8NMqOurKo", "outputId": "3c4f3cd0-afad-4049-d873-d6449d7fc46a" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label \\\n", "0 A Light in the Attic neutral \n", "1 A Light in the Attic neutral \n", "2 A Light in the Attic neutral \n", "3 A Light in the Attic neutral \n", "4 A Light in the Attic neutral \n", "\n", " review_text rating popularity_score \n", "0 Acceptable pacing with limited surprises. Three 3 \n", "1 An easy but unremarkable read. Three 3 \n", "2 Neither strongly recommended nor discouraged. Three 3 \n", "3 It held my attention, though not consistently. Three 3 \n", "4 Serviceable storytelling throughout. Three 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralAcceptable pacing with limited surprises.Three3
1A Light in the AtticneutralAn easy but unremarkable read.Three3
2A Light in the AtticneutralNeither strongly recommended nor discouraged.Three3
3A Light in the AtticneutralIt held my attention, though not consistently.Three3
4A Light in the AtticneutralServiceable storytelling throughout.Three3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"A gripping narrative that never lost momentum.\",\n \"The themes were poorly executed.\",\n \"A thoroughly enjoyable experience.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 20 } ], "source": [ "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", "EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }