{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "4ba6aba8"
},
"source": [
"# π€ **Data Collection, Creation, Storage, and Processing**\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jpASMyIQMaAq"
},
"source": [
"## **1.** 📦 Install required packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f48c8f8c",
"outputId": "13d0dd5e-82c6-489f-b1f0-e970186a4eb7"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
"Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
"Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
"Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
"Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
]
}
],
"source": [
"# Install into the running kernel's environment; requests is listed because the scraper imports it\n",
"%pip install -q requests beautifulsoup4 pandas matplotlib seaborn numpy textblob"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lquNYCbfL9IM"
},
"source": [
"## **2.** β Web-scrape all book titles, prices, and ratings from books.toscrape.com"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0IWuNpxxYDJF"
},
"source": [
"### *a. Initial setup*\n",
"Define the base url of the website you will scrape as well as how and what you will scrape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "91d52125"
},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import time\n",
"\n",
"# Paginated catalogue URL; the {} placeholder receives the page number\n",
"base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
"# Browser-like User-Agent header sent with every request\n",
"headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
"\n",
"# Accumulators filled by the scraping loop, one entry per book\n",
"titles = []\n",
"prices = []\n",
"ratings = []"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oCdTsin2Yfp3"
},
"source": [
"### *b. Fill titles, prices, and ratings from the web pages*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xqO5Y3dnYhxt"
},
"outputs": [],
"source": [
"# Start from empty lists so re-running this cell cannot append duplicate rows\n",
"titles, prices, ratings = [], [], []\n",
"\n",
"# Loop through all 50 pages\n",
"for page in range(1, 51):\n",
"    url = base_url.format(page)\n",
"    response = requests.get(url, headers=headers, timeout=30)\n",
"    response.raise_for_status()  # fail loudly instead of parsing an error page\n",
"    soup = BeautifulSoup(response.content, \"html.parser\")\n",
"    books = soup.find_all(\"article\", class_=\"product_pod\")\n",
"\n",
"    for book in books:\n",
"        titles.append(book.h3.a[\"title\"])\n",
"        # Keep only digits and the decimal point so a leading currency symbol\n",
"        # (or a stray mis-decoded character) cannot break the float conversion\n",
"        price_text = book.find(\"p\", class_=\"price_color\").text\n",
"        prices.append(float(\"\".join(ch for ch in price_text if ch.isdigit() or ch == \".\")))\n",
"        # The rating word is the second CSS class of the book's first <p>\n",
"        ratings.append(book.p.get(\"class\")[1])\n",
"\n",
"    time.sleep(0.5)  # polite scraping delay"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T0TOeRC4Yrnn"
},
"source": [
"### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
]
},
{
"cell_type": "code",
"source": [
"# Assemble the scraped lists into one table, one row per book\n",
"df_books = pd.DataFrame(dict(title=titles, price=prices, rating=ratings))\n",
"\n",
"# Preview the first rows\n",
"df_books.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "p5aK0rcz_6uN",
"outputId": "0349ca6e-4e6e-4e95-f0a9-996b0eaed67d"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating\n",
"0 A Light in the Attic 51.77 Three\n",
"1 Tipping the Velvet 53.74 One\n",
"2 Soumission 50.10 One\n",
"3 Sharp Objects 47.82 Four\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "l5FkkNhUYTHh",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "b0bfbd33-73f8-4d78-8f2b-2e0b74e2c606"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0.0/200.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91mββββββββββββββββββββββββββββββββββββββ\u001b[0m\u001b[91mβΈ\u001b[0m\u001b[90mβ\u001b[0m \u001b[32m194.6/200.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m200.8/200.8 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m140.6/140.6 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m92.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m52.7/52.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m201.5/201.5 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m69.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.12/dist-packages/sdv/single_table/base.py:168: FutureWarning: The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.\n",
" warnings.warn(DEPRECATION_MSG, FutureWarning)\n",
"/usr/local/lib/python3.12/dist-packages/sdv/single_table/base.py:134: UserWarning: We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.\n",
" warnings.warn(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Real (head):\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" price rating_score\n",
"0 51.77 3\n",
"1 53.74 1\n",
"2 50.10 1\n",
"3 47.82 4\n",
"4 54.23 5"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" rating_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 51.77 | \n",
" 3 | \n",
"
\n",
" \n",
" | 1 | \n",
" 53.74 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 50.10 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 47.82 | \n",
" 4 | \n",
"
\n",
" \n",
" | 4 | \n",
" 54.23 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 4,\n \"samples\": [\n 1,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Synthetic (head):\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" price rating_score\n",
"0 54.70 1\n",
"1 44.07 1\n",
"2 53.36 4\n",
"3 39.32 5\n",
"4 29.55 5"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" rating_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 54.70 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 44.07 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 53.36 | \n",
" 4 | \n",
"
\n",
" \n",
" | 3 | \n",
" 39.32 | \n",
" 5 | \n",
"
\n",
" \n",
" | 4 | \n",
" 29.55 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.399632205034946,\n \"min\": 29.55,\n \"max\": 54.7,\n \"num_unique_values\": 5,\n \"samples\": [\n 44.07,\n 29.55,\n 53.36\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 1,\n 4,\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Real stats:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" price rating_score\n",
"count 1000.00000 1000.000000\n",
"mean 35.07035 2.923000\n",
"std 14.44669 1.434967\n",
"min 10.00000 1.000000\n",
"25% 22.10750 2.000000\n",
"50% 35.98000 3.000000\n",
"75% 47.45750 4.000000\n",
"max 59.99000 5.000000"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" rating_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1000.00000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 35.07035 | \n",
" 2.923000 | \n",
"
\n",
" \n",
" | std | \n",
" 14.44669 | \n",
" 1.434967 | \n",
"
\n",
" \n",
" | min | \n",
" 10.00000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 22.10750 | \n",
" 2.000000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 35.98000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 47.45750 | \n",
" 4.000000 | \n",
"
\n",
" \n",
" | max | \n",
" 59.99000 | \n",
" 5.000000 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 342.59073671210575,\n \"min\": 10.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 35.07035,\n 35.980000000000004,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352.5781108922889,\n \"min\": 1.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 2.923,\n 3.0,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Synthetic stats:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" price rating_score\n",
"count 1000.000000 1000.000000\n",
"mean 33.977490 2.914000\n",
"std 14.402466 1.415135\n",
"min 10.050000 1.000000\n",
"25% 21.420000 2.000000\n",
"50% 33.695000 3.000000\n",
"75% 45.472500 4.000000\n",
"max 59.960000 5.000000"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" rating_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 33.977490 | \n",
" 2.914000 | \n",
"
\n",
" \n",
" | std | \n",
" 14.402466 | \n",
" 1.415135 | \n",
"
\n",
" \n",
" | min | \n",
" 10.050000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 21.420000 | \n",
" 2.000000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 33.695000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 45.472500 | \n",
" 4.000000 | \n",
"
\n",
" \n",
" | max | \n",
" 59.960000 | \n",
" 5.000000 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(\\\"Saved: books_real\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 342.8831053927661,\n \"min\": 10.05,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 33.977489999999996,\n 33.695,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352.57957731182347,\n \"min\": 1.0,\n \"max\": 1000.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 2.914,\n 3.0,\n 1000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saved: books_real.csv and books_synthetic.csv\n"
]
}
],
"source": [
"# =========================\n",
"# Part 2-c: Create a synthetic dataset from the scraped data\n",
"# =========================\n",
"\n",
"# 1) (Colab) Install SDV into the running kernel's environment if needed\n",
"%pip install -q sdv\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# 2) Build the real dataset from the scraped lists\n",
"df_real = pd.DataFrame({\n",
"    \"title\": titles,\n",
"    \"price\": prices,\n",
"    \"rating\": ratings\n",
"})\n",
"\n",
"# Basic cleaning: drop exact duplicate rows and rows with missing values,\n",
"# then make sure price is numeric (unparseable prices become NaN and are dropped)\n",
"df_real = df_real.drop_duplicates().dropna()\n",
"df_real[\"price\"] = pd.to_numeric(df_real[\"price\"], errors=\"coerce\")\n",
"df_real = df_real.dropna(subset=[\"price\"])\n",
"\n",
"# 3) Convert rating text to an ordered numeric score.\n",
"#    Labels missing from rating_map map to NaN, and those rows are dropped.\n",
"rating_map = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n",
"df_real[\"rating_score\"] = df_real[\"rating\"].map(rating_map)\n",
"df_real = df_real.dropna(subset=[\"rating_score\"])\n",
"df_real[\"rating_score\"] = df_real[\"rating_score\"].astype(int)\n",
"\n",
"# Synthesize only the \"model-friendly\" numeric columns\n",
"df_model = df_real[[\"price\", \"rating_score\"]].copy()\n",
"\n",
"# 4) Train a synthetic data generator and sample synthetic rows\n",
"from sdv.metadata import Metadata\n",
"from sdv.single_table import GaussianCopulaSynthesizer\n",
"\n",
"# Metadata is the replacement SDV names in its deprecation warning for SingleTableMetadata\n",
"metadata = Metadata.detect_from_dataframe(data=df_model, table_name=\"books\")\n",
"\n",
"synth = GaussianCopulaSynthesizer(metadata)\n",
"synth.fit(df_model)\n",
"\n",
"# Choose how many synthetic rows you want\n",
"n_synth = len(df_model)  # same size as real\n",
"df_synth = synth.sample(n_synth)\n",
"\n",
"# 5) Quick sanity checks (optional)\n",
"print(\"Real (head):\")\n",
"display(df_model.head())\n",
"\n",
"print(\"Synthetic (head):\")\n",
"display(df_synth.head())\n",
"\n",
"print(\"Real stats:\")\n",
"display(df_model.describe())\n",
"\n",
"print(\"Synthetic stats:\")\n",
"display(df_synth.describe())\n",
"\n",
"# 6) Save outputs\n",
"df_real.to_csv(\"books_real.csv\", index=False)\n",
"df_synth.to_csv(\"books_synthetic.csv\", index=False)\n",
"\n",
"print(\"Saved: books_real.csv and books_synthetic.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "duI5dv3CZYvF"
},
"source": [
"### *d. Save web-scraped dataframe either as a CSV or Excel file*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lC1U_YHtZifh"
},
"outputs": [],
"source": [
"# 💾 Save to CSV (index=False leaves the row index out of the file)\n",
"df_books.to_csv(\"books_data.csv\", index=False)\n",
"\n",
"# 💾 Or save to Excel\n",
"# df_books.to_excel(\"books_data.xlsx\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qMjRKMBQZlJi"
},
"source": [
"### *e. ✋🏻🛑⛔️ View the first few lines*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "O_wIvTxYZqCK",
"outputId": "f2296107-167d-4f7b-b363-a718c98dd34f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating\n",
"0 A Light in the Attic 51.77 Three\n",
"1 Tipping the Velvet 53.74 One\n",
"2 Soumission 50.10 One\n",
"3 Sharp Objects 47.82 Four\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"# Display the first rows of df_books as a quick check of the scraped columns\n",
"df_books.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p-1Pr2szaqLk"
},
"source": [
"## **3.** 🧩 Create a meaningful connection between real & synthetic datasets"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SIaJUGIpaH4V"
},
"source": [
"### *a. Initial setup*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-gPXGcRPuV_9"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import random\n",
"from datetime import datetime\n",
"import warnings\n",
"\n",
"# Blanket filter: hides every warning, including deprecation notices, for the rest of the session\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# One seed shared by both random generators used below, so the synthetic columns are reproducible\n",
"SEED = 2025\n",
"random.seed(SEED)\n",
"np.random.seed(SEED)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pY4yCoIuaQqp"
},
"source": [
"### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mnd5hdAbaNjz"
},
"outputs": [],
"source": [
"def generate_popularity_score(rating):\n",
"    \"\"\"Turn a text rating into a 1-5 popularity score with a random nudge.\n",
"\n",
"    Args:\n",
"        rating: Rating label (\"One\" ... \"Five\"); any other value uses a baseline of 3.\n",
"\n",
"    Returns:\n",
"        int: The baseline for the rating plus a random trend of -1, 0 or +1\n",
"        (drawn with weights 1:3:2), limited to the range 1-5.\n",
"    \"\"\"\n",
"    baseline_by_rating = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}\n",
"    baseline = baseline_by_rating.get(rating, 3)\n",
"    # Drawn from the stdlib random stream, so random.seed() controls the result\n",
"    (nudge,) = random.choices([-1, 0, 1], weights=[1, 3, 2])\n",
"    return max(1, min(5, baseline + nudge))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "n4-TaNTFgPak"
},
"source": [
"### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V-G3OCUCgR07",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"outputId": "99af6d80-1016-4798-dc27-322eb8f921cf"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating popularity_score\n",
"0 A Light in the Attic 51.77 Three 3\n",
"1 Tipping the Velvet 53.74 One 2\n",
"2 Soumission 50.10 One 2\n",
"3 Sharp Objects 47.82 Four 4\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five 3"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
" popularity_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
" 4 | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"# Create popularity_score column\n",
"df_books[\"popularity_score\"] = df_books[\"rating\"].apply(\n",
" lambda x: generate_popularity_score(x)\n",
")\n",
"\n",
"# Preview results\n",
"df_books.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HnngRNTgacYt"
},
"source": [
"### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kUtWmr8maZLZ"
},
"outputs": [],
"source": [
"def get_sentiment(popularity_score):\n",
"    \"\"\"Map a popularity score to a sentiment label.\n",
"\n",
"    Args:\n",
"        popularity_score: Numeric popularity score.\n",
"\n",
"    Returns:\n",
"        str: \"negative\" for scores of 2 or less, \"neutral\" for exactly 3,\n",
"        and \"positive\" for everything else.\n",
"    \"\"\"\n",
"    if popularity_score <= 2:\n",
"        return \"negative\"\n",
"    return \"neutral\" if popularity_score == 3 else \"positive\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HF9F9HIzgT7Z"
},
"source": [
"### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "tafQj8_7gYCG",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"outputId": "708aa87a-9b99-4e6c-85c3-ca8f45cd3774"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating popularity_score \\\n",
"0 A Light in the Attic 51.77 Three 3 \n",
"1 Tipping the Velvet 53.74 One 2 \n",
"2 Soumission 50.10 One 2 \n",
"3 Sharp Objects 47.82 Four 4 \n",
"4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n",
"\n",
" sentiment_label \n",
"0 neutral \n",
"1 negative \n",
"2 negative \n",
"3 positive \n",
"4 neutral "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
" popularity_score | \n",
" sentiment_label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
" 3 | \n",
" neutral | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
" 2 | \n",
" negative | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
" 2 | \n",
" negative | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
" 4 | \n",
" positive | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
" 3 | \n",
" neutral | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"# Create sentiment_label column\n",
"df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n",
"\n",
"# Preview results\n",
"df_books.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T8AdKkmASq9a"
},
"source": [
"## **4.** 📈 Generate 18 months of synthetic book sales data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OhXbdGD5fH0c"
},
"source": [
"### *a. Create a generate_sales_profile function that generates sales patterns based on sentiment_label (with some randomness)*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qkVhYPXGbgEn"
},
"outputs": [],
"source": [
"def generate_sales_profile(sentiment, n_months=18, end=None):\n",
"    \"\"\"Simulate monthly unit sales for one book.\n",
"\n",
"    Args:\n",
"        sentiment: \"positive\" gives a high, rising baseline; \"negative\" a low,\n",
"            falling one; any other value is treated as neutral (flat baseline).\n",
"        n_months: Number of month-end periods to generate (default 18).\n",
"        end: Last date to consider; defaults to today. The series covers the\n",
"            n_months month-ends up to this date.\n",
"\n",
"    Returns:\n",
"        list of (month, units) tuples in chronological order, where month is a\n",
"        \"YYYY-MM\" string and units is a non-negative integer.\n",
"    \"\"\"\n",
"    if end is None:\n",
"        end = datetime.today()\n",
"    # Explicit MonthEnd offset: same dates as the \"M\" alias without its deprecation warning\n",
"    months = pd.date_range(end=end, periods=n_months, freq=pd.offsets.MonthEnd())\n",
"    n_periods = len(months)\n",
"\n",
"    if sentiment == \"positive\":\n",
"        base = random.randint(200, 300)\n",
"        trend = np.linspace(base, base + random.randint(20, 60), n_periods)\n",
"    elif sentiment == \"negative\":\n",
"        base = random.randint(20, 80)\n",
"        trend = np.linspace(base, base - random.randint(10, 30), n_periods)\n",
"    else:  # neutral\n",
"        base = random.randint(80, 160)\n",
"        trend = np.full(n_periods, base + random.randint(-10, 10))\n",
"\n",
"    # Sine wave spanning 1.5 cycles over the window, amplitude 10 units\n",
"    seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, n_periods))\n",
"    noise = np.random.normal(0, 5, n_periods)\n",
"    # Clip at zero so a falling trend plus noise can never yield negative sales\n",
"    monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
"\n",
"    return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "L2ak1HlcgoTe"
},
"source": [
"### *b. Run the function as part of building sales_data*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SlJ24AUafoDB"
},
"outputs": [],
"source": [
"# One record per (book, month); a sales profile is generated once per book, in row order\n",
"sales_data = [\n",
"    {\n",
"        \"title\": title,\n",
"        \"month\": month,\n",
"        \"units_sold\": units,\n",
"        \"sentiment_label\": sentiment,\n",
"    }\n",
"    for title, sentiment in zip(df_books[\"title\"], df_books[\"sentiment_label\"])\n",
"    for month, units in generate_sales_profile(sentiment)\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4IXZKcCSgxnq"
},
"source": [
"### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wcN6gtiZg-ws",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"outputId": "9b0fa967-9a80-48c5-d67d-0728c9dbf0a3"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title month units_sold sentiment_label\n",
"0 A Light in the Attic 2024-09 100 neutral\n",
"1 A Light in the Attic 2024-10 109 neutral\n",
"2 A Light in the Attic 2024-11 102 neutral\n",
"3 A Light in the Attic 2024-12 107 neutral\n",
"4 A Light in the Attic 2025-01 108 neutral"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" month | \n",
" units_sold | \n",
" sentiment_label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 2024-09 | \n",
" 100 | \n",
" neutral | \n",
"
\n",
" \n",
" | 1 | \n",
" A Light in the Attic | \n",
" 2024-10 | \n",
" 109 | \n",
" neutral | \n",
"
\n",
" \n",
" | 2 | \n",
" A Light in the Attic | \n",
" 2024-11 | \n",
" 102 | \n",
" neutral | \n",
"
\n",
" \n",
" | 3 | \n",
" A Light in the Attic | \n",
" 2024-12 | \n",
" 107 | \n",
" neutral | \n",
"
\n",
" \n",
" | 4 | \n",
" A Light in the Attic | \n",
" 2025-01 | \n",
" 108 | \n",
" neutral | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_sales",
"summary": "{\n \"name\": \"df_sales\",\n \"rows\": 18000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-09\",\n \"2024-10\",\n \"2025-05\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 354,\n \"samples\": [\n 214,\n 289,\n 205\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 14
}
],
"source": [
"# Turn the list of per-month record dicts into a table (one column per dict key)\n",
"df_sales = pd.DataFrame(data=sales_data)\n",
"\n",
"# Show the first five rows as this cell's output\n",
"df_sales.head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EhIjz9WohAmZ"
},
"source": [
"### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MzbZvLcAhGaH",
"outputId": "5e03d47b-9db2-4bdb-de45-cf5cd23df169"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" title month units_sold sentiment_label\n",
"0 A Light in the Attic 2024-09 100 neutral\n",
"1 A Light in the Attic 2024-10 109 neutral\n",
"2 A Light in the Attic 2024-11 102 neutral\n",
"3 A Light in the Attic 2024-12 107 neutral\n",
"4 A Light in the Attic 2025-01 108 neutral\n"
]
}
],
"source": [
"# Persist the synthetic sales table; index=False keeps the row index out of the CSV.\n",
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
"\n",
"# End on a bare expression so the notebook renders the preview as a rich table,\n",
"# like the other preview cells, instead of print()'s plain-text dump.\n",
"df_sales.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7g9gqBgQMtJn"
},
"source": [
"## **5.** 🎯 Generate synthetic customer reviews"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Gi4y9M9KuDWx"
},
"source": [
"### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a dictionary called synthetic_reviews_by_sentiment that maps each sentiment label (\"positive\", \"neutral\", \"negative\") to a list of 50 distinct generic book review texts*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "b3cd2a50"
},
"outputs": [],
"source": [
"# Canned review texts, 50 per sentiment label; the keys must match the values\n",
"# found in df_books[\"sentiment_label\"] because the pools are looked up by label.\n",
"synthetic_reviews_by_sentiment = {\n",
"    \"positive\": [\n",
"        \"A compelling and heartwarming read that stayed with me long after I finished.\",\n",
"        \"Brilliantly written with unforgettable characters.\",\n",
"        \"An inspiring story that exceeded my expectations.\",\n",
"        \"Absolutely loved the pacing and emotional depth.\",\n",
"        \"A beautifully crafted narrative from start to finish.\",\n",
"        \"Captivating, immersive, and hard to put down.\",\n",
"        \"An outstanding book that I would highly recommend.\",\n",
"        \"The storytelling was vivid and deeply engaging.\",\n",
"        \"A powerful and moving literary experience.\",\n",
"        \"Rich characters and a satisfying conclusion.\",\n",
"        \"An uplifting and memorable journey.\",\n",
"        \"Exceptionally well-written and thoughtfully structured.\",\n",
"        \"A masterpiece of modern storytelling.\",\n",
"        \"Heartfelt and genuinely inspiring.\",\n",
"        \"The plot twists were clever and well executed.\",\n",
"        \"A delightful surprise that kept me hooked.\",\n",
"        \"Emotionally resonant and beautifully paced.\",\n",
"        \"An engaging story with strong character development.\",\n",
"        \"A truly rewarding read.\",\n",
"        \"Fresh, original, and wonderfully told.\",\n",
"        \"A gripping narrative that never lost momentum.\",\n",
"        \"Highly entertaining and emotionally satisfying.\",\n",
"        \"An absolute page-turner.\",\n",
"        \"Creative, thoughtful, and deeply moving.\",\n",
"        \"A remarkable reading experience.\",\n",
"        \"Well-developed themes and compelling dialogue.\",\n",
"        \"A standout book in its genre.\",\n",
"        \"Engrossing and thoughtfully written.\",\n",
"        \"An unforgettable literary journey.\",\n",
"        \"Full of heart and intelligence.\",\n",
"        \"A thoroughly enjoyable experience.\",\n",
"        \"Smart, engaging, and beautifully detailed.\",\n",
"        \"A story that lingers in the best way.\",\n",
"        \"Masterfully executed with emotional depth.\",\n",
"        \"An inspiring and touching tale.\",\n",
"        \"Richly imagined and skillfully told.\",\n",
"        \"A fantastic read from beginning to end.\",\n",
"        \"Compelling characters and a strong narrative arc.\",\n",
"        \"Deeply satisfying and wonderfully crafted.\",\n",
"        \"An emotionally rich and powerful book.\",\n",
"        \"A beautifully told and meaningful story.\",\n",
"        \"The writing style was elegant and immersive.\",\n",
"        \"A gripping and heartfelt novel.\",\n",
"        \"An impressive and engaging work.\",\n",
"        \"Thoroughly enjoyable with great pacing.\",\n",
"        \"A brilliant combination of plot and character.\",\n",
"        \"Highly compelling and well developed.\",\n",
"        \"A refreshing and inspiring read.\",\n",
"        \"An exceptional book worth revisiting.\",\n",
"        \"Simply outstanding in every way.\"\n",
"    ],\n",
"\n",
"    \"neutral\": [\n",
"        \"An average book — not great, but not bad either.\",\n",
"        \"Some parts stood out, others felt flat.\",\n",
"        \"It was okay overall, a decent way to pass the time.\",\n",
"        \"Fairly standard storytelling without many surprises.\",\n",
"        \"An adequate read with a few memorable moments.\",\n",
"        \"Neither impressive nor disappointing.\",\n",
"        \"A moderate experience with balanced strengths and flaws.\",\n",
"        \"The plot was predictable but readable.\",\n",
"        \"It held my attention, though not consistently.\",\n",
"        \"Reasonably enjoyable but not remarkable.\",\n",
"        \"A straightforward and simple narrative.\",\n",
"        \"Decent character work but uneven pacing.\",\n",
"        \"An acceptable read for a quiet afternoon.\",\n",
"        \"Not particularly memorable, but not bad.\",\n",
"        \"Average writing with a serviceable plot.\",\n",
"        \"Some interesting ideas that weren't fully explored.\",\n",
"        \"A book that met basic expectations.\",\n",
"        \"Entertaining enough, though somewhat forgettable.\",\n",
"        \"An ordinary story told competently.\",\n",
"        \"Mildly engaging from start to finish.\",\n",
"        \"A balanced mix of good and mediocre elements.\",\n",
"        \"It had potential but didn’t fully deliver.\",\n",
"        \"An easy but unremarkable read.\",\n",
"        \"The story progressed steadily without major highlights.\",\n",
"        \"Moderately satisfying overall.\",\n",
"        \"Nothing groundbreaking, but readable.\",\n",
"        \"A passable addition to the genre.\",\n",
"        \"Simple and straightforward storytelling.\",\n",
"        \"Occasionally engaging, occasionally dull.\",\n",
"        \"A fairly typical reading experience.\",\n",
"        \"Competent writing but lacking spark.\",\n",
"        \"An average effort with decent structure.\",\n",
"        \"Somewhat enjoyable yet inconsistent.\",\n",
"        \"A middle-of-the-road novel.\",\n",
"        \"Neither strongly recommended nor discouraged.\",\n",
"        \"An alright book with standard themes.\",\n",
"        \"Readable but not especially captivating.\",\n",
"        \"It delivered what was expected.\",\n",
"        \"A conventional and predictable plot.\",\n",
"        \"Pleasant but easily forgotten.\",\n",
"        \"Moderately interesting characters.\",\n",
"        \"A steady but unspectacular read.\",\n",
"        \"Not bad, just not exceptional.\",\n",
"        \"An okay narrative with minor highlights.\",\n",
"        \"Serviceable storytelling throughout.\",\n",
"        \"Balanced strengths and weaknesses.\",\n",
"        \"Acceptable pacing with limited surprises.\",\n",
"        \"Fine for casual reading.\",\n",
"        \"An unremarkable but functional story.\",\n",
"        \"Overall, a neutral reading experience.\"\n",
"    ],\n",
"\n",
"    \"negative\": [\n",
"        \"I struggled to get through this one.\",\n",
"        \"The plot was confusing and underdeveloped.\",\n",
"        \"Disappointing and not what I expected.\",\n",
"        \"The characters lacked depth and realism.\",\n",
"        \"Poor pacing made it hard to stay engaged.\",\n",
"        \"The storyline felt rushed and incomplete.\",\n",
"        \"Difficult to connect with the narrative.\",\n",
"        \"Predictable and uninspired writing.\",\n",
"        \"The dialogue felt forced and unnatural.\",\n",
"        \"An underwhelming reading experience.\",\n",
"        \"The book failed to hold my interest.\",\n",
"        \"Too many clichés and weak plot points.\",\n",
"        \"The structure was messy and unclear.\",\n",
"        \"A forgettable and frustrating read.\",\n",
"        \"The themes were poorly executed.\",\n",
"        \"Lacked emotional depth and engagement.\",\n",
"        \"I found it tedious and repetitive.\",\n",
"        \"Not compelling enough to recommend.\",\n",
"        \"The ending was abrupt and unsatisfying.\",\n",
"        \"Weak character development throughout.\",\n",
"        \"The writing style felt flat.\",\n",
"        \"An overall disappointing novel.\",\n",
"        \"Hard to follow and poorly organized.\",\n",
"        \"The pacing dragged significantly.\",\n",
"        \"The concept had promise but failed in execution.\",\n",
"        \"Unconvincing character motivations.\",\n",
"        \"A dull and uninspired storyline.\",\n",
"        \"The narrative felt disconnected.\",\n",
"        \"More frustrating than enjoyable.\",\n",
"        \"The book lacked focus and clarity.\",\n",
"        \"Repetitive and overly predictable.\",\n",
"        \"The emotional impact was minimal.\",\n",
"        \"Not engaging from start to finish.\",\n",
"        \"The story felt unfinished.\",\n",
"        \"An exhausting and unrewarding read.\",\n",
"        \"Too slow to maintain interest.\",\n",
"        \"The plot twists were ineffective.\",\n",
"        \"Shallow storytelling overall.\",\n",
"        \"The execution didn’t match the premise.\",\n",
"        \"A below-average literary effort.\",\n",
"        \"The characters were difficult to care about.\",\n",
"        \"Inconsistent tone throughout.\",\n",
"        \"It failed to meet expectations.\",\n",
"        \"The narrative lacked cohesion.\",\n",
"        \"Overly simplistic and bland.\",\n",
"        \"A frustratingly weak story.\",\n",
"        \"The pacing and structure were flawed.\",\n",
"        \"Not worth the time invested.\",\n",
"        \"A disappointing addition to the genre.\",\n",
"        \"Ultimately an unsatisfying read.\"\n",
"    ]\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fQhfVaDmuULT"
},
"source": [
"### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "l2SRc3PjuTGM"
},
"outputs": [],
"source": [
"# Build one record per (book, sampled review); book-level columns are repeated on each record.\n",
"review_rows = []\n",
"for _, book in df_books.iterrows():\n",
"    label = book[\"sentiment_label\"]\n",
"    # random.sample draws without replacement, so a book never gets the same pool entry twice.\n",
"    picks = random.sample(synthetic_reviews_by_sentiment[label], 10)\n",
"    review_rows.extend(\n",
"        {\n",
"            \"title\": book[\"title\"],\n",
"            \"sentiment_label\": label,\n",
"            \"review_text\": text,\n",
"            \"rating\": book[\"rating\"],\n",
"            \"popularity_score\": book[\"popularity_score\"],\n",
"        }\n",
"        for text in picks\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bmJMXF-Bukdm"
},
"source": [
"### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZUKUqZsuumsp"
},
"outputs": [],
"source": [
"# Materialise the review records as a table, then write it out without the index column.\n",
"df_reviews = pd.DataFrame(data=review_rows)\n",
"df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RYvGyVfXuo54"
},
"source": [
"### *d. ✋🏻🛑⛔️ View the first few lines*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 201
},
"id": "xfE8NMqOurKo",
"outputId": "3c4f3cd0-afad-4049-d873-d6449d7fc46a"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title sentiment_label \\\n",
"0 A Light in the Attic neutral \n",
"1 A Light in the Attic neutral \n",
"2 A Light in the Attic neutral \n",
"3 A Light in the Attic neutral \n",
"4 A Light in the Attic neutral \n",
"\n",
" review_text rating popularity_score \n",
"0 Acceptable pacing with limited surprises. Three 3 \n",
"1 An easy but unremarkable read. Three 3 \n",
"2 Neither strongly recommended nor discouraged. Three 3 \n",
"3 It held my attention, though not consistently. Three 3 \n",
"4 Serviceable storytelling throughout. Three 3 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sentiment_label | \n",
" review_text | \n",
" rating | \n",
" popularity_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" neutral | \n",
" Acceptable pacing with limited surprises. | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 1 | \n",
" A Light in the Attic | \n",
" neutral | \n",
" An easy but unremarkable read. | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 2 | \n",
" A Light in the Attic | \n",
" neutral | \n",
" Neither strongly recommended nor discouraged. | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 3 | \n",
" A Light in the Attic | \n",
" neutral | \n",
" It held my attention, though not consistently. | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" A Light in the Attic | \n",
" neutral | \n",
" Serviceable storytelling throughout. | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_reviews",
"summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"A gripping narrative that never lost momentum.\",\n \"The themes were poorly executed.\",\n \"A thoroughly enjoyable experience.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 20
}
],
"source": [
"# Preview the first five review records\n",
"df_reviews.head(n=5)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"jpASMyIQMaAq",
"lquNYCbfL9IM",
"0IWuNpxxYDJF",
"oCdTsin2Yfp3",
"T0TOeRC4Yrnn",
"duI5dv3CZYvF",
"qMjRKMBQZlJi",
"p-1Pr2szaqLk",
"SIaJUGIpaH4V",
"pY4yCoIuaQqp",
"n4-TaNTFgPak",
"HnngRNTgacYt",
"HF9F9HIzgT7Z",
"T8AdKkmASq9a",
"OhXbdGD5fH0c",
"L2ak1HlcgoTe",
"4IXZKcCSgxnq",
"EhIjz9WohAmZ",
"Gi4y9M9KuDWx",
"fQhfVaDmuULT",
"bmJMXF-Bukdm",
"RYvGyVfXuo54"
],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}