{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# 🤖 **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** 📦 Install required packages" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "12bccee2-077c-492f-9e8e-615db2caa9dc" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 pages\n", "for page in range(1, 51):\n", " url = base_url.format(page)\n", " response = requests.get(url, headers=headers)\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", " for book in books:\n", " titles.append(book.h3.a[\"title\"])\n", " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", " ratings.append(book.p.get(\"class\")[1])\n", "\n", " time.sleep(0.5) # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "l5FkkNhUYTHh", "colab": { "base_uri": "https://localhost:8080/", "height": 518 }, "outputId": "05fcbb8a-6fa1-4eb8-a884-659333c6d723" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Length check:\n", "Titles: 1000\n", "Prices: 1000\n", "Ratings: 1000\n", "\n", "DataFrame Shape: (1000, 3)\n", "\n", "Data Types:\n", "title string[python]\n", "price float64\n", "rating string[python]\n", "dtype: object\n", "\n", "Missing Values:\n", "title 0\n", "price 0\n", "rating 0\n", "dtype: int64\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_books\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ], "source": [ "# =========================\n", "# Part 2-c: Create df_books\n", "# =========================\n", "\n", "# 1️⃣ Check that all lists have the same length\n", "print(\"Length check:\")\n", "print(\"Titles:\", len(titles))\n", "print(\"Prices:\", len(prices))\n", "print(\"Ratings:\", len(ratings))\n", "\n", "if not (len(titles) == len(prices) == len(ratings)):\n", " raise ValueError(\"The lists do not have the same length. Scraping may have failed on some pages.\")\n", "\n", "# 2️⃣ Create the dataframe\n", "df_books = pd.DataFrame({\n", " \"title\": pd.Series(titles, dtype=\"string\").str.strip(),\n", " \"price\": pd.to_numeric(prices, errors=\"coerce\"),\n", " \"rating\": pd.Series(ratings, dtype=\"string\").str.strip()\n", "})\n", "\n", "# 3️⃣ Reset index (clean structure)\n", "df_books = df_books.reset_index(drop=True)\n", "\n", "# 4️⃣ Basic validation\n", "print(\"\\nDataFrame Shape:\", df_books.shape)\n", "print(\"\\nData Types:\")\n", "print(df_books.dtypes)\n", "\n", "print(\"\\nMissing Values:\")\n", "print(df_books.isna().sum())\n", "\n", "display(df_books.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# 💾 Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# 💾 Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. ✋🏻🛑⛔️ View first fiew lines*" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "O_wIvTxYZqCK", "outputId": "29327c64-20f0-41e2-c635-e25d5ed002ea" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ], "source": [ "# Show the first 5 rows\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", " return int(np.clip(base + trend_factor, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "V-G3OCUCgR07", "colab": { "base_uri": "https://localhost:8080/", "height": 379 }, "outputId": "3835df83-5761-406c-95c4-7d8f3660e6a8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "DataFrame shape: (1000, 4)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " title price rating popularity_score\n", "0 A Light in the Attic 51.77 Three 3\n", "1 Tipping the Velvet 53.74 One 2\n", "2 Soumission 50.10 One 2\n", "3 Sharp Objects 47.82 Four 4\n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_score
0A Light in the Attic51.77Three3
1Tipping the Velvet53.74One2
2Soumission50.10One2
3Sharp Objects47.82Four4
4Sapiens: A Brief History of Humankind54.23Five3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(df_books[\\\"popularity_score\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Popularity Score Distribution:\n", "popularity_score\n", "1 38\n", "2 197\n", "3 327\n", "4 321\n", "5 117\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# =========================\n", "# Create popularity_score column\n", "# =========================\n", "\n", "# Apply the function to the rating column\n", "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n", "\n", "# Quick validation\n", "print(\"DataFrame shape:\", df_books.shape)\n", "\n", "# Show first 5 rows\n", "display(df_books.head())\n", "\n", "# Check distribution of the new variable\n", "print(\"\\nPopularity Score Distribution:\")\n", "print(df_books[\"popularity_score\"].value_counts().sort_index())" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "tafQj8_7gYCG", "colab": { "base_uri": "https://localhost:8080/", "height": 345 }, "outputId": "9bb6ed59-2bf6-4a41-8354-509515d182e7" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "DataFrame shape: (1000, 5)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " title price rating popularity_score \\\n", "0 A Light in the Attic 51.77 Three 3 \n", "1 Tipping the Velvet 53.74 One 2 \n", "2 Soumission 50.10 One 2 \n", "3 Sharp Objects 47.82 Four 4 \n", "4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n", "\n", " sentiment_label \n", "0 neutral \n", "1 negative \n", "2 negative \n", "3 positive \n", "4 neutral " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingpopularity_scoresentiment_label
0A Light in the Attic51.77Three3neutral
1Tipping the Velvet53.74One2negative
2Soumission50.10One2negative
3Sharp Objects47.82Four4positive
4Sapiens: A Brief History of Humankind54.23Five3neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print(df_books[\\\"sentiment_label\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Sentiment Distribution:\n", "sentiment_label\n", "positive 438\n", "neutral 327\n", "negative 235\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# =========================\n", "# Create sentiment_label column\n", "# =========================\n", "\n", "# Apply function to popularity_score\n", "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n", "\n", "# Quick validation\n", "print(\"DataFrame shape:\", df_books.shape)\n", "\n", "# Show first 5 rows\n", "display(df_books.head())\n", "\n", "# Check sentiment distribution\n", "print(\"\\nSentiment Distribution:\")\n", "print(df_books[\"sentiment_label\"].value_counts())" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** 📈 Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment):\n", " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", "\n", " if sentiment == \"positive\":\n", " base = random.randint(200, 300)\n", " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", " elif sentiment == \"negative\":\n", " base = random.randint(20, 80)\n", " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", " else: # neutral\n", " base = random.randint(80, 160)\n", " trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", " noise = np.random.normal(0, 5, len(months))\n", " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", " records = generate_sales_profile(row[\"sentiment_label\"])\n", " for month, units in records:\n", " sales_data.append({\n", " \"title\": row[\"title\"],\n", " \"month\": month,\n", " \"units_sold\": units,\n", " \"sentiment_label\": row[\"sentiment_label\"]\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "wcN6gtiZg-ws", "colab": { "base_uri": "https://localhost:8080/", "height": 640 }, "outputId": "edaf3e0c-0135-4218-a13a-24b0fdab4287" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Shape of df_sales: (18000, 4)\n", "\n", "Columns:\n", "Index(['title', 'month', 'units_sold', 'sentiment_label'], dtype='object')\n", "\n", "Data types before cleaning:\n", "title object\n", "month object\n", "units_sold int64\n", "sentiment_label object\n", "dtype: object\n", "\n", "Data types after cleaning:\n", "title object\n", "month datetime64[ns]\n", "units_sold int64\n", "sentiment_label object\n", "dtype: object\n", "\n", "Missing values:\n", "title 0\n", "month 0\n", "units_sold 0\n", "sentiment_label 0\n", "dtype: int64\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09-01 100 neutral\n", "1 A Light in the Attic 2024-10-01 109 neutral\n", "2 A Light in the Attic 2024-11-01 102 neutral\n", "3 A Light in the Attic 2024-12-01 107 neutral\n", "4 A Light in the Attic 2025-01-01 108 neutral" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09-01100neutral
1A Light in the Attic2024-10-01109neutral
2A Light in the Attic2024-11-01102neutral
3A Light in the Attic2024-12-01107neutral
4A Light in the Attic2025-01-01108neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_sales\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A Light in the Attic\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2024-09-01 00:00:00\",\n \"max\": \"2025-01-01 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2024-10-01 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 100,\n \"max\": 109,\n \"num_unique_values\": 5,\n \"samples\": [\n 109\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ], "source": [ "# =========================\n", "# Create df_sales from sales_data\n", "# =========================\n", "\n", "# 1️⃣ Convert list of dictionaries into DataFrame\n", "df_sales = pd.DataFrame(sales_data)\n", "\n", "# 2️⃣ Basic validation\n", "print(\"Shape of df_sales:\", df_sales.shape)\n", "print(\"\\nColumns:\")\n", "print(df_sales.columns)\n", "\n", "print(\"\\nData types before cleaning:\")\n", "print(df_sales.dtypes)\n", "\n", "# 3️⃣ Ensure correct data types\n", "df_sales[\"month\"] = pd.to_datetime(df_sales[\"month\"], format=\"%Y-%m\")\n", "df_sales[\"units_sold\"] = pd.to_numeric(df_sales[\"units_sold\"], errors=\"coerce\")\n", "\n", "# 4️⃣ Final validation\n", "print(\"\\nData types after cleaning:\")\n", "print(df_sales.dtypes)\n", "\n", "print(\"\\nMissing values:\")\n", "print(df_sales.isna().sum())\n", "\n", "display(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MzbZvLcAhGaH", "outputId": "e5a2089f-49fb-4311-9e02-1e7204382cd5" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09-01 100 neutral\n", "1 A Light in the Attic 2024-10-01 109 neutral\n", "2 A Light in the Attic 2024-11-01 102 neutral\n", "3 A Light in the Attic 2024-12-01 107 neutral\n", "4 A Light in the Attic 2025-01-01 108 neutral\n" ] } ], "source": [ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "print(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** 🎯 Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "b3cd2a50" }, "outputs": [], "source": [ "# =========================\n", "# Synthetic Review Library\n", "# =========================\n", "\n", "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"Absolutely loved this book — it exceeded my expectations.\",\n", " \"A beautifully written story that kept me engaged throughout.\",\n", " \"The characters felt real and the journey was unforgettable.\",\n", " \"An inspiring and uplifting read.\",\n", " \"I couldn't put it down — truly captivating.\",\n", " \"A powerful narrative with emotional depth.\",\n", " \"One of the most enjoyable books I've read recently.\",\n", " \"Thought-provoking and wonderfully paced.\",\n", " \"An outstanding piece of storytelling.\",\n", " \"Rich in detail and full of heart.\",\n", " \"A masterfully crafted and compelling novel.\",\n", " \"The writing style was elegant and immersive.\",\n", " \"Highly recommended for anyone who loves great fiction.\",\n", " \"A deeply satisfying reading experience.\",\n", " \"It delivered everything I hoped for and more.\",\n", " \"An engaging plot with meaningful themes.\",\n", " \"Beautiful prose and a gripping storyline.\",\n", " \"A refreshing and memorable read.\",\n", " \"I was hooked from the first chapter.\",\n", " \"The emotional impact was incredible.\",\n", " \"A fantastic blend of drama and insight.\",\n", " \"Creative, smart, and thoroughly enjoyable.\",\n", " \"This book truly stands out.\",\n", " \"A rewarding and impactful story.\",\n", " \"An exceptional and moving narrative.\",\n", " \"I would gladly read this again.\",\n", " \"Strong characters and excellent pacing.\",\n", " \"It left a lasting impression on me.\",\n", " \"A brilliant and heartfelt story.\",\n", " \"Compelling from beginning to end.\",\n", " \"An imaginative and beautifully told tale.\",\n", " \"A story that resonates long after finishing.\",\n", " \"Thoroughly entertaining and meaningful.\",\n", " \"An absorbing and skillfully written book.\",\n", " \"The themes were handled with great care.\",\n", " \"An impressive and emotionally rich novel.\",\n", " \"The author did a fantastic job.\",\n", " \"A wonderful surprise and a joy to read.\",\n", " \"Truly inspiring and well-executed.\",\n", " \"An unforgettable reading experience.\",\n", " \"Deeply engaging and thoughtfully written.\",\n", " \"A delightful and captivating story.\",\n", " \"Everything about this book worked for me.\",\n", " \"A well-structured and compelling narrative.\",\n", " \"A standout title in its genre.\",\n", " \"An emotional rollercoaster in the best way.\",\n", " \"Expertly written and thoroughly enjoyable.\",\n", " \"The storytelling was simply outstanding.\",\n", " \"A gripping and meaningful journey.\",\n", " \"A beautifully developed and inspiring book.\"\n", " ],\n", "\n", " \"neutral\": [\n", " \"An average book — not particularly memorable.\",\n", " \"It had some strong moments but also some weak ones.\",\n", " \"A decent read overall.\",\n", " \"Neither impressive nor disappointing.\",\n", " \"Some chapters were engaging, others less so.\",\n", " \"It was okay, but nothing extraordinary.\",\n", " \"A fairly standard story.\",\n", " \"An acceptable way to spend an afternoon.\",\n", " \"The plot was predictable but readable.\",\n", " \"Not bad, but not outstanding either.\",\n", " \"A mixed reading experience.\",\n", " \"Some characters stood out, others faded.\",\n", " \"The pacing was inconsistent at times.\",\n", " \"It held my attention occasionally.\",\n", " \"An ordinary but readable novel.\",\n", " \"It had potential but didn't fully deliver.\",\n", " \"The writing was competent but not remarkable.\",\n", " \"An average effort overall.\",\n", " \"Interesting in parts, slow in others.\",\n", " \"A moderately enjoyable read.\",\n", " \"I neither loved nor disliked it.\",\n", " \"The themes were somewhat engaging.\",\n", " \"A reasonable but forgettable book.\",\n", " \"Not as strong as I expected.\",\n", " \"It was fine, just not memorable.\",\n", " \"Some elements worked better than others.\",\n", " \"An uneven but passable story.\",\n", " \"The concept was interesting, execution average.\",\n", " \"It didn’t fully captivate me.\",\n", " \"A fair attempt with mixed results.\",\n", " \"Serviceable but not standout.\",\n", " \"A readable yet unremarkable book.\",\n", " \"There were moments of interest.\",\n", " \"It felt somewhat conventional.\",\n", " \"An okay read with minor highlights.\",\n", " \"The storyline was acceptable.\",\n", " \"It met basic expectations.\",\n", " \"A safe and predictable narrative.\",\n", " \"Nothing particularly new or exciting.\",\n", " \"A book I won’t revisit but don’t regret.\",\n", " \"Some parts were enjoyable.\",\n", " \"A mildly engaging experience.\",\n", " \"It had both strengths and weaknesses.\",\n", " \"An overall average performance.\",\n", " \"The writing was simple and straightforward.\",\n", " \"A balanced but unremarkable read.\",\n", " \"It delivered a standard storyline.\",\n", " \"Somewhat entertaining but not gripping.\",\n", " \"It was adequate for its genre.\",\n", " \"A middle-of-the-road book.\"\n", " ],\n", "\n", " \"negative\": [\n", " \"I struggled to stay engaged throughout.\",\n", " \"The story failed to capture my interest.\",\n", " \"Disappointing from start to finish.\",\n", " \"The characters felt flat and unconvincing.\",\n", " \"It didn’t live up to the hype.\",\n", " \"The pacing was painfully slow.\",\n", " \"I found the plot confusing.\",\n", " \"The writing style didn’t appeal to me.\",\n", " \"A frustrating reading experience.\",\n", " \"The story lacked direction.\",\n", " \"I expected much more from this book.\",\n", " \"It was difficult to finish.\",\n", " \"The narrative felt disjointed.\",\n", " \"The themes weren’t well developed.\",\n", " \"The dialogue seemed unrealistic.\",\n", " \"I couldn’t connect with the characters.\",\n", " \"The storyline felt repetitive.\",\n", " \"It left me underwhelmed.\",\n", " \"The book lacked emotional impact.\",\n", " \"Not as compelling as I had hoped.\",\n", " \"The ending was unsatisfying.\",\n", " \"It felt rushed and incomplete.\",\n", " \"The plot had too many gaps.\",\n", " \"I lost interest halfway through.\",\n", " \"The execution was disappointing.\",\n", " \"The concept was better than the delivery.\",\n", " \"It didn’t hold my attention.\",\n", " \"A forgettable and dull read.\",\n", " \"The structure felt messy.\",\n", " \"It failed to leave a lasting impression.\",\n", " \"The writing felt uninspired.\",\n", " \"The story lacked depth.\",\n", " \"I found it quite tedious.\",\n", " \"The pacing was uneven and slow.\",\n", " \"It lacked originality.\",\n", " \"A missed opportunity.\",\n", " \"The characters weren’t believable.\",\n", " \"It didn’t resonate with me.\",\n", " \"The development was weak.\",\n", " \"The plot twists felt forced.\",\n", " \"I wouldn’t recommend it.\",\n", " \"It didn’t meet my expectations.\",\n", " \"The storytelling was underwhelming.\",\n", " \"The book felt overly long.\",\n", " \"The tone felt inconsistent.\",\n", " \"It was hard to stay invested.\",\n", " \"The narrative felt shallow.\",\n", " \"Not an enjoyable experience.\",\n", " \"It lacked clarity and focus.\",\n", " \"Overall, a disappointing read.\"\n", " ]\n", "}" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [ "review_rows = []\n", "for _, row in df_books.iterrows():\n", " title = row['title']\n", " sentiment_label = row['sentiment_label']\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", " sampled_reviews = random.sample(review_pool, 10)\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row['rating'],\n", " \"popularity_score\": row['popularity_score']\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" ] }, { "cell_type": "markdown", "source": [ "### *c. inputs for R*" ], "metadata": { "id": "_602pYUS3gY5" } }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. ✋🏻🛑⛔️ View the first few lines*" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "xfE8NMqOurKo", "outputId": "29dcaaf0-5a04-4ee0-e2cf-2fb743b40f35" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label \\\n", "0 A Light in the Attic neutral \n", "1 A Light in the Attic neutral \n", "2 A Light in the Attic neutral \n", "3 A Light in the Attic neutral \n", "4 A Light in the Attic neutral \n", "\n", " review_text rating popularity_score \n", "0 It delivered a standard storyline. Three 3 \n", "1 A reasonable but forgettable book. Three 3 \n", "2 An okay read with minor highlights. Three 3 \n", "3 The plot was predictable but readable. Three 3 \n", "4 The writing was simple and straightforward. Three 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralIt delivered a standard storyline.Three3
1A Light in the AtticneutralA reasonable but forgettable book.Three3
2A Light in the AtticneutralAn okay read with minor highlights.Three3
3A Light in the AtticneutralThe plot was predictable but readable.Three3
4A Light in the AtticneutralThe writing was simple and straightforward.Three3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"A fantastic blend of drama and insight.\",\n \"The dialogue seemed unrealistic.\",\n \"An imaginative and beautifully told tale.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 25 } ], "source": [ "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", "EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }