{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "4ba6aba8"
},
"source": [
"# ๐ค **Data Collection, Creation, Storage, and Processing**\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jpASMyIQMaAq"
},
"source": [
"## **1.** ๐ฆ Install required packages"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f48c8f8c",
"outputId": "b2ca7f1a-9d54-4844-d0b5-6bdcac1fdf58"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
"Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
"Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
"Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
"Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
]
}
],
"source": [
"!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lquNYCbfL9IM"
},
"source": [
"## **2.** โ Web-scrape all book titles, prices, and ratings from books.toscrape.com"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0IWuNpxxYDJF"
},
"source": [
"### *a. Initial setup*\n",
"Define the base url of the website you will scrape as well as how and what you will scrape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "91d52125"
},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import time\n",
"\n",
"base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
"headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
"\n",
"titles, prices, ratings = [], [], []"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oCdTsin2Yfp3"
},
"source": [
"### *b. Fill titles, prices, and ratings from the web pages*"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "xqO5Y3dnYhxt"
},
"outputs": [],
"source": [
"# Loop through all 50 pages\n",
"for page in range(1, 51):\n",
" url = base_url.format(page)\n",
" response = requests.get(url, headers=headers)\n",
" soup = BeautifulSoup(response.content, \"html.parser\")\n",
" books = soup.find_all(\"article\", class_=\"product_pod\")\n",
"\n",
" for book in books:\n",
" titles.append(book.h3.a[\"title\"])\n",
" prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n",
" ratings.append(book.p.get(\"class\")[1])\n",
"\n",
" time.sleep(0.5) # polite scraping delay"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T0TOeRC4Yrnn"
},
"source": [
"### *c. โ๐ป๐โ๏ธ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "l5FkkNhUYTHh",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"outputId": "8c962ec1-dc39-44ad-b779-730351edde24"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating\n",
"0 A Light in the Attic 51.77 Three\n",
"1 Tipping the Velvet 53.74 One\n",
"2 Soumission 50.10 One\n",
"3 Sharp Objects 47.82 Four\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"import pandas as pd\n",
"\n",
"df_books = pd.DataFrame({\n",
" \"title\": titles,\n",
" \"price\": prices,\n",
" \"rating\": ratings\n",
"})\n",
"\n",
"df_books.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "duI5dv3CZYvF"
},
"source": [
"### *d. Save web-scraped dataframe either as a CSV or Excel file*"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "lC1U_YHtZifh"
},
"outputs": [],
"source": [
"# ๐พ Save to CSV\n",
"df_books.to_csv(\"books_data.csv\", index=False)\n",
"\n",
"# ๐พ Or save to Excel\n",
"# df_books.to_excel(\"books_data.xlsx\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qMjRKMBQZlJi"
},
"source": [
"### *e. โ๐ป๐โ๏ธ View first fiew lines*"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "O_wIvTxYZqCK",
"outputId": "9d403240-8a04-4e3d-9a2d-fec660b9b8f4"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating\n",
"0 A Light in the Attic 51.77 Three\n",
"1 Tipping the Velvet 53.74 One\n",
"2 Soumission 50.10 One\n",
"3 Sharp Objects 47.82 Four\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"# View first few lines\n",
"df_books.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p-1Pr2szaqLk"
},
"source": [
"## **3.** ๐งฉ Create a meaningful connection between real & synthetic datasets"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SIaJUGIpaH4V"
},
"source": [
"### *a. Initial setup*"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "-gPXGcRPuV_9"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import random\n",
"from datetime import datetime\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"random.seed(2025)\n",
"np.random.seed(2025)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pY4yCoIuaQqp"
},
"source": [
"### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "mnd5hdAbaNjz"
},
"outputs": [],
"source": [
"def generate_popularity_score(rating):\n",
" base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n",
" trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n",
" return int(np.clip(base + trend_factor, 1, 5))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "n4-TaNTFgPak"
},
"source": [
"### *c. โ๐ป๐โ๏ธ Run the function to create a \"popularity_score\" column from \"rating\"*"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "V-G3OCUCgR07",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"outputId": "3bfd9e0f-fa90-4b13-ba80-9b468d041978"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating popularity_score\n",
"0 A Light in the Attic 51.77 Three 3\n",
"1 Tipping the Velvet 53.74 One 2\n",
"2 Soumission 50.10 One 2\n",
"3 Sharp Objects 47.82 Four 4\n",
"4 Sapiens: A Brief History of Humankind 54.23 Five 3"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
" popularity_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
" 3 | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
" 4 | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 13
}
],
"source": [
"# Create popularity_score column based on rating\n",
"df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n",
"\n",
"df_books.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HnngRNTgacYt"
},
"source": [
"### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "kUtWmr8maZLZ"
},
"outputs": [],
"source": [
"def get_sentiment(popularity_score):\n",
" if popularity_score <= 2:\n",
" return \"negative\"\n",
" elif popularity_score == 3:\n",
" return \"neutral\"\n",
" else:\n",
" return \"positive\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HF9F9HIzgT7Z"
},
"source": [
"### *e. โ๐ป๐โ๏ธ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"id": "tafQj8_7gYCG",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"outputId": "57910696-4fbc-4df5-c86e-ad4dc1f0321a"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title price rating popularity_score \\\n",
"0 A Light in the Attic 51.77 Three 3 \n",
"1 Tipping the Velvet 53.74 One 2 \n",
"2 Soumission 50.10 One 2 \n",
"3 Sharp Objects 47.82 Four 4 \n",
"4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n",
"\n",
" sentiment_label \n",
"0 neutral \n",
"1 negative \n",
"2 negative \n",
"3 positive \n",
"4 neutral "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" price | \n",
" rating | \n",
" popularity_score | \n",
" sentiment_label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 51.77 | \n",
" Three | \n",
" 3 | \n",
" neutral | \n",
"
\n",
" \n",
" | 1 | \n",
" Tipping the Velvet | \n",
" 53.74 | \n",
" One | \n",
" 2 | \n",
" negative | \n",
"
\n",
" \n",
" | 2 | \n",
" Soumission | \n",
" 50.10 | \n",
" One | \n",
" 2 | \n",
" negative | \n",
"
\n",
" \n",
" | 3 | \n",
" Sharp Objects | \n",
" 47.82 | \n",
" Four | \n",
" 4 | \n",
" positive | \n",
"
\n",
" \n",
" | 4 | \n",
" Sapiens: A Brief History of Humankind | \n",
" 54.23 | \n",
" Five | \n",
" 3 | \n",
" neutral | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_books",
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"# Create sentiment_label column based on popularity_score\n",
"df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n",
"\n",
"df_books.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "T8AdKkmASq9a"
},
"source": [
"## **4.** ๐ Generate synthetic book sales data of 18 months"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OhXbdGD5fH0c"
},
"source": [
"### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"id": "qkVhYPXGbgEn"
},
"outputs": [],
"source": [
"def generate_sales_profile(sentiment):\n",
" months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n",
"\n",
" if sentiment == \"positive\":\n",
" base = random.randint(200, 300)\n",
" trend = np.linspace(base, base + random.randint(20, 60), len(months))\n",
" elif sentiment == \"negative\":\n",
" base = random.randint(20, 80)\n",
" trend = np.linspace(base, base - random.randint(10, 30), len(months))\n",
" else: # neutral\n",
" base = random.randint(80, 160)\n",
" trend = np.full(len(months), base + random.randint(-10, 10))\n",
"\n",
" seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n",
" noise = np.random.normal(0, 5, len(months))\n",
" monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
"\n",
" return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "L2ak1HlcgoTe"
},
"source": [
"### *b. Run the function as part of building sales_data*"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "SlJ24AUafoDB"
},
"outputs": [],
"source": [
"sales_data = []\n",
"for _, row in df_books.iterrows():\n",
" records = generate_sales_profile(row[\"sentiment_label\"])\n",
" for month, units in records:\n",
" sales_data.append({\n",
" \"title\": row[\"title\"],\n",
" \"month\": month,\n",
" \"units_sold\": units,\n",
" \"sentiment_label\": row[\"sentiment_label\"]\n",
" })"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4IXZKcCSgxnq"
},
"source": [
"### *c. โ๐ป๐โ๏ธ Create a df_sales DataFrame from sales_data*"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"id": "wcN6gtiZg-ws",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"outputId": "2c5a32e4-0960-4f83-8397-36f750ffc7f0"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title month units_sold sentiment_label\n",
"0 A Light in the Attic 2024-08 100 neutral\n",
"1 A Light in the Attic 2024-09 109 neutral\n",
"2 A Light in the Attic 2024-10 102 neutral\n",
"3 A Light in the Attic 2024-11 107 neutral\n",
"4 A Light in the Attic 2024-12 108 neutral"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" month | \n",
" units_sold | \n",
" sentiment_label | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A Light in the Attic | \n",
" 2024-08 | \n",
" 100 | \n",
" neutral | \n",
"
\n",
" \n",
" | 1 | \n",
" A Light in the Attic | \n",
" 2024-09 | \n",
" 109 | \n",
" neutral | \n",
"
\n",
" \n",
" | 2 | \n",
" A Light in the Attic | \n",
" 2024-10 | \n",
" 102 | \n",
" neutral | \n",
"
\n",
" \n",
" | 3 | \n",
" A Light in the Attic | \n",
" 2024-11 | \n",
" 107 | \n",
" neutral | \n",
"
\n",
" \n",
" | 4 | \n",
" A Light in the Attic | \n",
" 2024-12 | \n",
" 108 | \n",
" neutral | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df_sales",
"summary": "{\n \"name\": \"df_sales\",\n \"rows\": 18000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-08\",\n \"2024-09\",\n \"2025-04\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 354,\n \"samples\": [\n 214,\n 289,\n 205\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 19
}
],
"source": [
"import pandas as pd\n",
"\n",
"df_sales = pd.DataFrame(sales_data)\n",
"\n",
"df_sales.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EhIjz9WohAmZ"
},
"source": [
"### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MzbZvLcAhGaH",
"outputId": "55e476f5-8c5e-4c7b-b6a4-043afc037bae"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" title month units_sold sentiment_label\n",
"0 A Light in the Attic 2024-08 100 neutral\n",
"1 A Light in the Attic 2024-09 109 neutral\n",
"2 A Light in the Attic 2024-10 102 neutral\n",
"3 A Light in the Attic 2024-11 107 neutral\n",
"4 A Light in the Attic 2024-12 108 neutral\n"
]
}
],
"source": [
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
"\n",
"print(df_sales.head())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7g9gqBgQMtJn"
},
"source": [
"## **5.** ๐ฏ Generate synthetic customer reviews"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Gi4y9M9KuDWx"
},
"source": [
"### *a. โ๐ป๐โ๏ธ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"id": "b3cd2a50"
},
"outputs": [],
"source": [
"synthetic_reviews_by_sentiment = {\n",
" \"positive\": [\n",
" \"Absolutely loved this book from start to finish.\",\n",
" \"A beautifully written story with memorable characters.\",\n",
" \"Engaging plot and emotionally satisfying ending.\",\n",
" \"One of the most enjoyable reads I've had in a while.\",\n",
" \"Inspiring, thoughtful, and deeply moving.\",\n",
" \"The storytelling was immersive and captivating.\",\n",
" \"A powerful narrative that stayed with me.\",\n",
" \"Well-paced and wonderfully developed.\",\n",
" \"The characters felt authentic and relatable.\",\n",
" \"An uplifting and rewarding experience.\",\n",
" \"Brilliant execution and compelling themes.\",\n",
" \"The authorโs voice was strong and confident.\",\n",
" \"A delightful surprise that exceeded expectations.\",\n",
" \"Emotionally rich and incredibly engaging.\",\n",
" \"Hard to put down once I started.\",\n",
" \"An outstanding contribution to the genre.\",\n",
" \"Smart, entertaining, and beautifully crafted.\",\n",
" \"A truly satisfying and meaningful read.\",\n",
" \"The writing style was elegant and fluid.\",\n",
" \"An unforgettable literary experience.\",\n",
" \"Creative, thoughtful, and well-structured.\",\n",
" \"The plot twists were exciting and well done.\",\n",
" \"A heartwarming and impactful story.\",\n",
" \"Masterfully written and highly engaging.\",\n",
" \"The pacing kept me hooked throughout.\",\n",
" \"An impressive and polished work.\",\n",
" \"Deeply resonant and emotionally compelling.\",\n",
" \"A refreshing and enjoyable read.\",\n",
" \"Strong character development and vivid scenes.\",\n",
" \"A fascinating and rewarding journey.\",\n",
" \"The dialogue felt natural and sharp.\",\n",
" \"An inspiring and beautifully told tale.\",\n",
" \"Richly detailed and thoughtfully written.\",\n",
" \"A standout book in its category.\",\n",
" \"Highly recommended for fans of the genre.\",\n",
" \"An engaging blend of emotion and action.\",\n",
" \"Thought-provoking and satisfying.\",\n",
" \"A memorable and meaningful story.\",\n",
" \"The themes were handled with depth and care.\",\n",
" \"An exceptional and absorbing narrative.\",\n",
" \"Truly enjoyable from beginning to end.\",\n",
" \"A smart and emotionally layered story.\",\n",
" \"Compelling storytelling with a strong finish.\",\n",
" \"A wonderfully crafted and immersive book.\",\n",
" \"An exciting and heartfelt read.\",\n",
" \"The author did a fantastic job.\",\n",
" \"A captivating and beautifully structured novel.\",\n",
" \"A rewarding and enriching experience.\",\n",
" \"Strong writing and vivid imagination.\",\n",
" \"An excellent book I would gladly reread.\"\n",
" ],\n",
"\n",
" \"neutral\": [\n",
" \"An average read with some interesting moments.\",\n",
" \"It had both strengths and weaknesses.\",\n",
" \"Not bad, but not particularly memorable either.\",\n",
" \"Some parts were engaging, others less so.\",\n",
" \"A decent book for passing the time.\",\n",
" \"The story was fine but nothing extraordinary.\",\n",
" \"Moderately enjoyable overall.\",\n",
" \"The pacing was uneven at times.\",\n",
" \"An okay read with mixed impressions.\",\n",
" \"Some characters stood out more than others.\",\n",
" \"The plot was predictable but acceptable.\",\n",
" \"A fairly standard story.\",\n",
" \"There were highlights, but also dull sections.\",\n",
" \"It met expectations but didnโt exceed them.\",\n",
" \"Reasonably entertaining but not remarkable.\",\n",
" \"The writing was serviceable.\",\n",
" \"An average experience overall.\",\n",
" \"The concept was interesting but execution varied.\",\n",
" \"A typical entry in the genre.\",\n",
" \"It held my attention in parts.\",\n",
" \"Not particularly original, but readable.\",\n",
" \"Some scenes worked better than others.\",\n",
" \"An adequate and straightforward read.\",\n",
" \"The themes were present but lightly explored.\",\n",
" \"Neither impressive nor disappointing.\",\n",
" \"A passable story with moderate appeal.\",\n",
" \"The ending was acceptable.\",\n",
" \"It had potential that wasnโt fully realized.\",\n",
" \"Somewhat engaging but uneven.\",\n",
" \"An alright book with room for improvement.\",\n",
" \"Fairly enjoyable in places.\",\n",
" \"The dialogue was average.\",\n",
" \"A mixed but tolerable experience.\",\n",
" \"The characters were moderately developed.\",\n",
" \"It was fine for casual reading.\",\n",
" \"An ordinary story with standard pacing.\",\n",
" \"There were moments of interest.\",\n",
" \"Not bad overall, just not standout.\",\n",
" \"A reasonable but forgettable read.\",\n",
" \"The book was competently written.\",\n",
" \"It did what it set out to do.\",\n",
" \"Slightly engaging but not gripping.\",\n",
" \"An acceptable addition to the shelf.\",\n",
" \"The story had both highs and lows.\",\n",
" \"An average literary effort.\",\n",
" \"Somewhat enjoyable but not memorable.\",\n",
" \"The narrative was steady but plain.\",\n",
" \"A simple and predictable read.\",\n",
" \"It was okay overall.\",\n",
" \"A neutral reading experience.\"\n",
" ],\n",
"\n",
" \"negative\": [\n",
" \"I struggled to stay interested throughout.\",\n",
" \"The plot felt confusing and disjointed.\",\n",
" \"Disappointing overall and hard to finish.\",\n",
" \"The characters lacked depth and realism.\",\n",
" \"It didnโt live up to expectations.\",\n",
" \"The pacing was slow and uneven.\",\n",
" \"I found it difficult to connect with the story.\",\n",
" \"The writing style didnโt appeal to me.\",\n",
" \"A frustrating and underwhelming read.\",\n",
" \"The narrative felt repetitive.\",\n",
" \"Not as engaging as I had hoped.\",\n",
" \"The story lacked coherence.\",\n",
" \"I lost interest midway through.\",\n",
" \"The dialogue felt unnatural.\",\n",
" \"A missed opportunity with weak execution.\",\n",
" \"The ending was unsatisfying.\",\n",
" \"The plot twists were predictable.\",\n",
" \"It felt overly long and dragged out.\",\n",
" \"The themes were poorly developed.\",\n",
" \"I expected much more from this book.\",\n",
" \"The characters were forgettable.\",\n",
" \"The story felt flat and uninspired.\",\n",
" \"It failed to capture my attention.\",\n",
" \"The structure was confusing.\",\n",
" \"A dull and disappointing experience.\",\n",
" \"The writing lacked clarity.\",\n",
" \"I wouldnโt recommend this one.\",\n",
" \"The concept was interesting but poorly executed.\",\n",
" \"It felt rushed in key parts.\",\n",
" \"The emotional impact was minimal.\",\n",
" \"The storyline was weak and inconsistent.\",\n",
" \"I found it tedious to read.\",\n",
" \"The book didnโt resonate with me.\",\n",
" \"The pacing made it hard to enjoy.\",\n",
" \"The character arcs were unsatisfying.\",\n",
" \"A forgettable and disappointing read.\",\n",
" \"It lacked originality and depth.\",\n",
" \"The plot development was poor.\",\n",
" \"I struggled to understand the direction.\",\n",
" \"It didnโt hold my attention.\",\n",
" \"The story felt incomplete.\",\n",
" \"The writing was uninspired.\",\n",
" \"The book was not engaging.\",\n",
" \"I wouldnโt read it again.\",\n",
" \"The overall execution was lacking.\",\n",
" \"The narrative felt forced.\",\n",
" \"It was difficult to stay invested.\",\n",
" \"A disappointing literary experience.\",\n",
" \"The book failed to impress.\",\n",
" \"Not worth the time in my opinion.\"\n",
" ]\n",
"}\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fQhfVaDmuULT"
},
"source": [
"### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"id": "l2SRc3PjuTGM"
},
"outputs": [],
"source": [
"# Build one flat record per (book, sampled review).\n",
"# Every book draws 10 reviews, without replacement, from the pool that\n",
"# matches its sentiment label; book-level fields are copied onto each record.\n",
"review_rows = []\n",
"for _, book in df_books.iterrows():\n",
"    book_title = book['title']\n",
"    label = book['sentiment_label']\n",
"    picked = random.sample(synthetic_reviews_by_sentiment[label], 10)\n",
"    review_rows.extend(\n",
"        {\n",
"            \"title\": book_title,\n",
"            \"sentiment_label\": label,\n",
"            \"review_text\": text,\n",
"            \"rating\": book['rating'],\n",
"            \"popularity_score\": book['popularity_score']\n",
"        }\n",
"        for text in picked\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bmJMXF-Bukdm"
},
"source": [
"### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"id": "ZUKUqZsuumsp"
},
"outputs": [],
"source": [
"# Turn the list of review records into a table, then persist it (without the\n",
"# row index) as a CSV in the current working directory.\n",
"df_reviews = pd.DataFrame(data=review_rows)\n",
"df_reviews.to_csv(path_or_buf=\"synthetic_book_reviews.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"source": [
"### *c (continued). Build the input files for R: title-level features and a monthly revenue series*"
],
"metadata": {
"id": "_602pYUS3gY5"
}
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3946e521",
"outputId": "33160805-20df-4483-dde9-3b557b25f063"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"โ
Wrote synthetic_title_level_features.csv\n",
"โ
Wrote synthetic_monthly_revenue_series.csv\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Spelled-out numbers that a digits-only clean-up would otherwise erase\n",
"# (e.g. star ratings stored as words -- presumably the case for the scraped\n",
"# `rating` column; confirm against the scraping cell).\n",
"_NUMBER_WORDS = {\n",
"    \"zero\": \"0\", \"one\": \"1\", \"two\": \"2\", \"three\": \"3\", \"four\": \"4\", \"five\": \"5\",\n",
"    \"six\": \"6\", \"seven\": \"7\", \"eight\": \"8\", \"nine\": \"9\", \"ten\": \"10\",\n",
"}\n",
"\n",
"def _safe_num(s):\n",
"    \"\"\"Coerce messy scalar-like values to numbers; unparseable entries become NaN.\n",
"\n",
"    Each value is converted to text, trimmed and lower-cased, then handled as:\n",
"      * a spelled-out number \"zero\"..\"ten\" -> its digit form (\"Three\" -> 3);\n",
"      * text that already parses as a number and contains a digit -> kept as is,\n",
"        so the sign and exponent survive (\"-4.5\", \"1e-05\");\n",
"      * anything else -> every character except digits and \".\" is removed\n",
"        (\"$51.77\" -> 51.77, \"1,234.50\" -> 1234.5); a leading \"-\" is kept as sign.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    s : array-like or pandas.Series\n",
"        Values to convert.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.Series\n",
"        Numeric series carrying the same index as ``pd.Series(s)``.\n",
"    \"\"\"\n",
"    text = pd.Series(s).astype(str).str.strip().str.lower()\n",
"    text = text.map(lambda v: _NUMBER_WORDS.get(v, v))\n",
"    # Requiring a digit keeps strings such as \"inf\" out of the fast path,\n",
"    # so they still end up as NaN.\n",
"    is_plain = pd.to_numeric(text, errors=\"coerce\").notna() & text.str.contains(r\"\\d\")\n",
"    sign = text.str.startswith(\"-\").map({True: \"-\", False: \"\"})\n",
"    digits_only = sign + text.str.replace(r\"[^0-9.]\", \"\", regex=True)\n",
"    cleaned = text.where(is_plain, digits_only)\n",
"    # A single final conversion lets pandas infer int vs. float for the column.\n",
"    return pd.to_numeric(cleaned, errors=\"coerce\")\n",
"\n",
"# --- Clean book metadata (price/rating) ---\n",
"# Work on copies so the frames built in earlier cells are left untouched and\n",
"# this cell can be re-run safely.\n",
"df_books_r = df_books.copy()\n",
"for num_col in (\"price\", \"rating\"):\n",
"    if num_col in df_books_r.columns:\n",
"        df_books_r[num_col] = _safe_num(df_books_r[num_col])\n",
"\n",
"df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n",
"\n",
"# --- Clean sales ---\n",
"df_sales_r = df_sales.copy()\n",
"df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n",
"# Unparseable months become NaT; they are dropped before the monthly aggregation.\n",
"df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n",
"df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n",
"\n",
"# --- Clean reviews ---\n",
"df_reviews_r = df_reviews.copy()\n",
"df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n",
"df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n",
"for num_col in (\"rating\", \"popularity_score\"):\n",
"    if num_col in df_reviews_r.columns:\n",
"        df_reviews_r[num_col] = _safe_num(df_reviews_r[num_col])\n",
"\n",
"# --- Sentiment shares per title (from reviews) ---\n",
"sentiment_labels = [\"positive\", \"neutral\", \"negative\"]\n",
"sent_counts = (\n",
"    df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
")\n",
"# Guarantee all three count columns exist even when a label never occurs.\n",
"for lab in sentiment_labels:\n",
"    if lab not in sent_counts.columns:\n",
"        sent_counts[lab] = 0\n",
"\n",
"sent_counts[\"total_reviews\"] = sent_counts[sentiment_labels].sum(axis=1)\n",
"# A zero total is turned into NaN so the shares come out as NaN.\n",
"den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n",
"for lab in sentiment_labels:\n",
"    sent_counts[\"share_\" + lab] = sent_counts[lab] / den\n",
"sent_counts = sent_counts.reset_index()\n",
"\n",
"# --- Sales aggregation per title ---\n",
"sales_by_title = (\n",
"    df_sales_r.dropna(subset=[\"title\"])\n",
"    .groupby(\"title\", as_index=False)\n",
"    .agg(\n",
"        months_observed=(\"month\", \"nunique\"),\n",
"        avg_units_sold=(\"units_sold\", \"mean\"),\n",
"        total_units_sold=(\"units_sold\", \"sum\"),\n",
"    )\n",
")\n",
"\n",
"# --- Title-level features (join sales + books + sentiment) ---\n",
"# NOTE(review): both joins are keyed on `title`; if df_books holds repeated\n",
"# titles, the left joins repeat rows for them -- confirm titles are unique.\n",
"df_title = (\n",
"    sales_by_title\n",
"    .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n",
"    .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n",
"           on=\"title\", how=\"left\")\n",
")\n",
"\n",
"# Revenue proxy: units sold times the book's list price.\n",
"df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n",
"df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n",
"\n",
"df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n",
"print(\"✅ Wrote synthetic_title_level_features.csv\")\n",
"\n",
"# --- Monthly revenue series (proxy: units_sold * price) ---\n",
"monthly_rev = (\n",
"    df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n",
")\n",
"monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n",
"\n",
"# min_count=1 keeps a month NaN when none of its rows has a revenue value.\n",
"# With the default (min_count=0) an all-NaN month sums to 0.0, which made the\n",
"# missing-price fallback below unreachable.\n",
"df_monthly = (\n",
"    monthly_rev.dropna(subset=[\"month\"])\n",
"    .groupby(\"month\", as_index=False)[\"revenue\"]\n",
"    .sum(min_count=1)\n",
"    .rename(columns={\"revenue\": \"total_revenue\"})\n",
"    .sort_values(\"month\")\n",
")\n",
"# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n",
"if df_monthly[\"total_revenue\"].isna().all():\n",
"    df_monthly = (\n",
"        df_sales_r.dropna(subset=[\"month\"])\n",
"        .groupby(\"month\", as_index=False)[\"units_sold\"]\n",
"        .sum()\n",
"        .rename(columns={\"units_sold\": \"total_revenue\"})\n",
"        .sort_values(\"month\")\n",
"    )\n",
"\n",
"# Store months as ISO date strings in the CSV.\n",
"df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n",
"df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n",
"print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RYvGyVfXuo54"
},
"source": [
"### *d. ✋🏻🛑⛔️ View the first few lines*"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 963
},
"id": "xfE8NMqOurKo",
"outputId": "b34adff4-1832-4c6c-b4d6-7a40b303615e"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"=== df_title (title-level features) ===\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" title months_observed \\\n",
"0 \"Most Blessed of the Patriarchs\": Thomas Jeffe... 18 \n",
"1 #GIRLBOSS 18 \n",
"2 #HigherSelfie: Wake Up Your Life. Free Your So... 18 \n",
"3 'Salem's Lot 18 \n",
"4 (Un)Qualified: How God Uses Broken People to D... 18 \n",
"\n",
" avg_units_sold total_units_sold price rating share_positive \\\n",
"0 285.555556 5140 44.48 NaN 1.0 \n",
"1 47.944444 863 50.96 NaN 0.0 \n",
"2 226.777778 4082 23.11 NaN 1.0 \n",
"3 246.055556 4429 49.56 NaN 1.0 \n",
"4 294.444444 5300 54.00 NaN 1.0 \n",
"\n",
" share_neutral share_negative total_reviews avg_revenue total_revenue \n",
"0 0.0 0.0 10 12701.511111 228627.20 \n",
"1 0.0 1.0 10 2443.248889 43978.48 \n",
"2 0.0 0.0 10 5240.834444 94335.02 \n",
"3 0.0 0.0 10 12194.513333 219501.24 \n",
"4 0.0 0.0 10 15900.000000 286200.00 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" months_observed | \n",
" avg_units_sold | \n",
" total_units_sold | \n",
" price | \n",
" rating | \n",
" share_positive | \n",
" share_neutral | \n",
" share_negative | \n",
" total_reviews | \n",
" avg_revenue | \n",
" total_revenue | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" \"Most Blessed of the Patriarchs\": Thomas Jeffe... | \n",
" 18 | \n",
" 285.555556 | \n",
" 5140 | \n",
" 44.48 | \n",
" NaN | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 10 | \n",
" 12701.511111 | \n",
" 228627.20 | \n",
"
\n",
" \n",
" | 1 | \n",
" #GIRLBOSS | \n",
" 18 | \n",
" 47.944444 | \n",
" 863 | \n",
" 50.96 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 10 | \n",
" 2443.248889 | \n",
" 43978.48 | \n",
"
\n",
" \n",
" | 2 | \n",
" #HigherSelfie: Wake Up Your Life. Free Your So... | \n",
" 18 | \n",
" 226.777778 | \n",
" 4082 | \n",
" 23.11 | \n",
" NaN | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 10 | \n",
" 5240.834444 | \n",
" 94335.02 | \n",
"
\n",
" \n",
" | 3 | \n",
" 'Salem's Lot | \n",
" 18 | \n",
" 246.055556 | \n",
" 4429 | \n",
" 49.56 | \n",
" NaN | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 10 | \n",
" 12194.513333 | \n",
" 219501.24 | \n",
"
\n",
" \n",
" | 4 | \n",
" (Un)Qualified: How God Uses Broken People to D... | \n",
" 18 | \n",
" 294.444444 | \n",
" 5300 | \n",
" 54.00 | \n",
" NaN | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 10 | \n",
" 15900.000000 | \n",
" 286200.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(df_monthly\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"#GIRLBOSS\",\n \"(Un)Qualified: How God Uses Broken People to Do Big Things\",\n \"#HigherSelfie: Wake Up Your Life. Free Your Soul. Find Your Tribe.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"months_observed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 18,\n \"max\": 18,\n \"num_unique_values\": 1,\n \"samples\": [\n 18\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 100.20894374958456,\n \"min\": 47.94444444444444,\n \"max\": 294.44444444444446,\n \"num_unique_values\": 5,\n \"samples\": [\n 47.94444444444444\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1803,\n \"min\": 863,\n \"max\": 5300,\n \"num_unique_values\": 5,\n \"samples\": [\n 863\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.400476603743908,\n \"min\": 23.11,\n \"max\": 54.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 50.96\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"share_positive\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.44721359549995804,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"share_neutral\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"share_negative\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.44721359549995804,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 10,\n \"max\": 10,\n \"num_unique_values\": 1,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5617.298744073206,\n \"min\": 2443.248888888889,\n \"max\": 15900.0,\n \"num_unique_values\": 5,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 101111.37739331771,\n \"min\": 43978.48,\n \"max\": 286200.0,\n \"num_unique_values\": 5,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"shape: (1000, 12)\n",
"rating 1000\n",
"title 0\n",
"avg_units_sold 0\n",
"months_observed 0\n",
"total_units_sold 0\n",
"price 0\n",
"share_positive 0\n",
"share_neutral 0\n",
"share_negative 0\n",
"total_reviews 0\n",
"dtype: int64\n",
"\n",
"=== df_monthly (monthly revenue series) ===\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" month total_revenue\n",
"0 2024-08-01 5631956.77\n",
"1 2024-09-01 5856653.68\n",
"2 2024-10-01 6006876.26\n",
"3 2024-11-01 6061519.85\n",
"4 2024-12-01 6014276.79"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" month | \n",
" total_revenue | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2024-08-01 | \n",
" 5631956.77 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2024-09-01 | \n",
" 5856653.68 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2024-10-01 | \n",
" 6006876.26 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2024-11-01 | \n",
" 6061519.85 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2024-12-01 | \n",
" 6014276.79 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print(df_monthly\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2024-09-01\",\n \"2024-12-01\",\n \"2024-10-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 175556.39765987248,\n \"min\": 5631956.77,\n \"max\": 6061519.85,\n \"num_unique_values\": 5,\n \"samples\": [\n 5856653.68,\n 6014276.79,\n 6006876.26\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"shape: (18, 2)\n",
"month 0\n",
"total_revenue 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# d. Preview both R input tables: first rows, shape, and missing-value counts.\n",
"# For df_title only the 10 columns with the most missing values are listed;\n",
"# df_monthly lists every column.\n",
"\n",
"previews = [\n",
"    (\n",
"        \"=== df_title (title-level features) ===\",\n",
"        df_title,\n",
"        df_title.isna().sum().sort_values(ascending=False).head(10),\n",
"    ),\n",
"    (\n",
"        \"\\n=== df_monthly (monthly revenue series) ===\",\n",
"        df_monthly,\n",
"        df_monthly.isna().sum(),\n",
"    ),\n",
"]\n",
"for heading, frame, missing in previews:\n",
"    print(heading)\n",
"    display(frame.head())\n",
"    print(\"shape:\", frame.shape)\n",
"    print(missing)\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}