diff --git "a/1_Data_Creation.ipynb" "b/1_Data_Creation.ipynb" new file mode 100644--- /dev/null +++ "b/1_Data_Creation.ipynb" @@ -0,0 +1,2254 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "4ba6aba8" + }, + "source": [ + "# πŸ€– **Data Collection, Creation, Storage, and Processing**\n" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "!pip install Faker\n", + "from faker import Faker\n", + "\n", + "fake = Faker()\n", + "\n", + "# -----------------------------\n", + "# Generate synthetic book reviews\n", + "# -----------------------------\n", + "titles = [\"A Light in the Attic\", \"#GIRLBOSS\", \"'Salem's Lot\", \"The Silent Patient\", \"Atomic Habits\"]\n", + "ratings = [\"One\", \"Two\", \"Three\", \"Four\", \"Five\"]\n", + "\n", + "review_data = []\n", + "\n", + "for title in titles:\n", + " for _ in range(10):\n", + " review_data.append({\n", + " \"title\": title,\n", + " \"review_text\": fake.sentence(),\n", + " \"rating\": random.choice(ratings),\n", + " \"popularity_score\": random.randint(1,5)\n", + " })\n", + "\n", + "df_reviews = pd.DataFrame(review_data)\n", + "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)\n", + "\n", + "# -----------------------------\n", + "# Generate synthetic sales data\n", + "# -----------------------------\n", + "months = pd.date_range(start=\"2024-01-01\", periods=12, freq=\"MS\")\n", + "\n", + "sales_data = []\n", + "\n", + "for title in titles:\n", + " for month in months:\n", + " sales_data.append({\n", + " \"title\": title,\n", + " \"month\": month.strftime(\"%Y-%m\"),\n", + " \"units_sold\": random.randint(40, 300)\n", + " })\n", + "\n", + "df_sales = pd.DataFrame(sales_data)\n", + "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", + "\n", + "print(\"βœ… CSV files created successfully.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SSETonFTbLUp", + 
"outputId": "d447c98e-4f65-4096-d7f0-10c8bc42ea8c" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting Faker\n", + " Downloading faker-40.8.0-py3-none-any.whl.metadata (16 kB)\n", + "Downloading faker-40.8.0-py3-none-any.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: Faker\n", + "Successfully installed Faker-40.8.0\n", + "βœ… CSV files created successfully.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jpASMyIQMaAq" + }, + "source": [ + "## **1.** πŸ“¦ Install required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f48c8f8c", + "outputId": "ce64ade8-dfd6-4cd1-d5ef-26fbbd960024" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in 
/usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" + ] + } + ], + "source": [ + "!pip install beautifulsoup4 pandas matplotlib 
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL template for the paginated catalogue; "{}" is filled with the page number.
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
headers = {"User-Agent": "Mozilla/5.0"}

N_PAGES = 50          # catalogue pages requested (1..N_PAGES)
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection blocks forever
PAGE_DELAY = 0.5      # seconds to wait between pages (polite scraping delay)

# Always start from empty lists. The page loop only appends, so if the lists
# were created once elsewhere and the loop ran again, every book would be added
# a second time. The saved output of df_books (2000 rows but only 999 unique
# titles) is consistent with that having happened.
titles, prices, ratings = [], [], []

# Loop through all catalogue pages
for page in range(1, N_PAGES + 1):
    url = base_url.format(page)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on a 4xx/5xx answer instead of parsing an error page, which
    # would silently contribute zero books.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        # The anchor's "title" attribute is read rather than its text.
        titles.append(book.h3.a["title"])
        # [1:] drops the first character of the price text before parsing.
        # Presumably that is a one-character currency symbol; TODO confirm.
        prices.append(float(book.find("p", class_="price_color").text[1:]))
        # The first <p> carries two CSS classes; the second one is stored as
        # the rating (the saved output shows words such as "Three").
        ratings.append(book.p.get("class")[1])

    time.sleep(PAGE_DELAY)
βœ‹πŸ»πŸ›‘β›”οΈ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "l5FkkNhUYTHh" + }, + "outputs": [], + "source": [ + "df_books = pd.DataFrame({\n", + " \"title\": titles,\n", + " \"price\": prices,\n", + " \"rating\": ratings\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "duI5dv3CZYvF" + }, + "source": [ + "### *d. Save web-scraped dataframe either as a CSV or Excel file*" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "lC1U_YHtZifh" + }, + "outputs": [], + "source": [ + "# πŸ’Ύ Save to CSV\n", + "df_books.to_csv(\"books_data.csv\", index=False)\n", + "\n", + "# πŸ’Ύ Or save to Excel\n", + "# df_books.to_excel(\"books_data.xlsx\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qMjRKMBQZlJi" + }, + "source": [ + "### *e. βœ‹πŸ»πŸ›‘β›”οΈ View first fiew lines*" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "O_wIvTxYZqCK", + "outputId": "439fa361-8db4-45a9-feed-c7a60ef88805" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title price rating\n", + "0 A Light in the Attic 51.77 Three\n", + "1 Tipping the Velvet 53.74 One\n", + "2 Soumission 50.10 One\n", + "3 Sharp Objects 47.82 Four\n", + "4 Sapiens: A Brief History of Humankind 54.23 Five" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_books", + "summary": "{\n \"name\": \"df_books\",\n \"rows\": 2000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.443075738771789,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 19 + } + ], + "source": [ + "df_books.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p-1Pr2szaqLk" + }, + "source": [ + "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SIaJUGIpaH4V" + }, + "source": [ + "### *a. Initial setup*" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "-gPXGcRPuV_9" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import random\n", + "from datetime import datetime\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "random.seed(2025)\n", + "np.random.seed(2025)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pY4yCoIuaQqp" + }, + "source": [ + "### *b. 
def generate_popularity_score(rating):
    """Turn a rating word into a noisy popularity score from 1 to 5.

    The rating word selects a starting score ("One" -> 2, "Two"/"Three" -> 3,
    "Four"/"Five" -> 4); any other value starts at 3. One random adjustment of
    -1, 0 or +1, drawn with weights 1:3:2, is added and the result is kept
    within 1..5.

    Args:
        rating: Rating label to look up.

    Returns:
        int: Popularity score between 1 and 5 inclusive.
    """
    base_by_rating = {"One": 2, "Two": 3, "Three": 3, "Four": 4, "Five": 4}
    if rating in base_by_rating:
        starting_score = base_by_rating[rating]
    else:
        starting_score = 3  # unknown labels are treated as mid-range

    # Exactly one weighted draw per call, so seeded runs stay reproducible.
    (adjustment,) = random.choices((-1, 0, 1), weights=(1, 3, 2))

    return max(1, min(5, starting_score + adjustment))
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleratingpopularity_score
0A Light in the AtticThree3
1Tipping the VelvetOne2
2SoumissionOne2
3Sharp ObjectsFour4
4Sapiens: A Brief History of HumankindFive3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"print(df_books[\\\"popularity_score\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Popularity score distribution:\n", + "popularity_score\n", + "1 79\n", + "2 364\n", + "3 670\n", + "4 646\n", + "5 241\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# 3c) Create popularity_score from rating\n", + "if \"df_books\" not in globals():\n", + " raise NameError(\"df_books does not exist yet. Run Part 2c first to create df_books.\")\n", + "\n", + "if \"rating\" not in df_books.columns:\n", + " raise KeyError(\"df_books must contain a 'rating' column. 
def get_sentiment(popularity_score):
    """Map a popularity score to a sentiment label.

    Args:
        popularity_score: Numeric score to classify.

    Returns:
        str: "negative" for scores of 2 or less, "neutral" for exactly 3,
        and "positive" for everything else.
    """
    if popularity_score == 3:
        return "neutral"
    return "negative" if popularity_score <= 2 else "positive"
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlepopularity_scoresentiment_label
0A Light in the Attic3neutral
1Tipping the Velvet2negative
2Soumission2negative
3Sharp Objects4positive
4Sapiens: A Brief History of Humankind3neutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"print(df_books[\\\"sentiment_label\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Sentiment distribution:\n", + "sentiment_label\n", + "positive 887\n", + "neutral 670\n", + "negative 443\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# 3e) Create sentiment_label from popularity_score\n", + "if \"popularity_score\" not in df_books.columns:\n", + " raise KeyError(\"popularity_score not found. 
def generate_sales_profile(sentiment, n_months=18, end=None):
    """Simulate monthly unit sales for one book.

    The series is a sentiment-dependent trend, plus a sine-wave seasonal term
    (amplitude 10, one and a half cycles across the window), plus Gaussian
    noise (standard deviation 5). The sum is floored at zero and truncated to
    integers.

    Args:
        sentiment: "positive" gives a rising trend that starts between 200 and
            300 units and gains 20-60 over the window; "negative" gives a
            falling trend that starts between 20 and 80 and loses 10-30. Any
            other value is treated as neutral: a flat level between 70 and 170.
        n_months: Number of consecutive months to generate. Defaults to 18.
        end: Anything ``pd.Timestamp`` accepts; the window ends at the last
            month whose final day is on or before this date. Defaults to the
            current date and time. Pass a fixed date to make the month labels
            reproducible.

    Returns:
        list[tuple[str, int]]: ``("YYYY-MM", units_sold)`` pairs, oldest first.
    """
    end_ts = pd.Timestamp(datetime.today() if end is None else end)

    # The window is built from monthly periods rather than
    # date_range(..., freq="M"): that offset alias is deprecated from
    # pandas 2.2 on, while the period alias "M" is not. A month counts only
    # once it is complete, i.e. when `end` falls on its last day, which is the
    # set of month-ends date_range(end=..., freq="M") produced.
    last_month = end_ts.to_period("M")
    if not end_ts.is_month_end:
        last_month -= 1
    months = pd.period_range(end=last_month, periods=n_months, freq="M")

    if sentiment == "positive":
        base = random.randint(200, 300)
        trend = np.linspace(base, base + random.randint(20, 60), n_months)
    elif sentiment == "negative":
        base = random.randint(20, 80)
        trend = np.linspace(base, base - random.randint(10, 30), n_months)
    else:  # neutral
        base = random.randint(80, 160)
        trend = np.full(n_months, base + random.randint(-10, 10))

    seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, n_months))
    noise = np.random.normal(0, 5, n_months)
    # A falling trend plus noise can dip below zero; unit sales cannot.
    monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)

    return list(zip(months.strftime("%Y-%m"), monthly_sales))
# One record per (book, month): each book's sentiment label drives its
# simulated sales series, and the label is stored on every row of that series.
sales_data = []
for _, row in df_books.iterrows():
    book_title = row["title"]
    book_sentiment = row["sentiment_label"]
    sales_data.extend(
        {
            "title": book_title,
            "month": month,
            "units_sold": units,
            "sentiment_label": book_sentiment,
        }
        for month, units in generate_sales_profile(book_sentiment)
    )
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09122neutral
1A Light in the Attic2024-10131neutral
2A Light in the Attic2024-11124neutral
3A Light in the Attic2024-12129neutral
4A Light in the Attic2025-01130neutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display(df_sales\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A Light in the Attic\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2024-10\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 122,\n \"max\": 131,\n \"num_unique_values\": 5,\n \"samples\": [\n 131\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ], + "source": [ + "# 4c) Create df_sales DataFrame\n", + "df_sales = pd.DataFrame(sales_data)\n", + "\n", + "print(\"βœ… df_sales created:\", df_sales.shape)\n", + "display(df_sales.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EhIjz9WohAmZ" + }, + "source": [ + "### *d. 
Save df_sales as synthetic_sales_data.csv & view first few lines*" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MzbZvLcAhGaH", + "outputId": "1a9d48fd-cadf-4d75-b7f8-47314b650245" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " title month units_sold sentiment_label\n", + "0 A Light in the Attic 2024-09 122 neutral\n", + "1 A Light in the Attic 2024-10 131 neutral\n", + "2 A Light in the Attic 2024-11 124 neutral\n", + "3 A Light in the Attic 2024-12 129 neutral\n", + "4 A Light in the Attic 2025-01 130 neutral\n" + ] + } + ], + "source": [ + "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", + "\n", + "print(df_sales.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7g9gqBgQMtJn" + }, + "source": [ + "## **5.** 🎯 Generate synthetic customer reviews" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gi4y9M9KuDWx" + }, + "source": [ + "### *a. βœ‹πŸ»πŸ›‘β›”οΈ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "b3cd2a50", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e5822837-aeed-4359-b66c-3fe28773bff6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'positive': 50, 'neutral': 50, 'negative': 50}\n", + "Example positive: ['A captivating read with strong characters and a satisfying ending. (positive review #1)', 'Beautifully written β€” I couldn’t put it down. 
# Expand to 50 by mixing templates + small variations
def expand_to_50(seed_list, label, n=50):
    """Cycle through ``seed_list`` to build ``n`` distinct review strings.

    Entry ``i`` (0-based) reuses ``seed_list[i % len(seed_list)]`` and appends
    the tag ``" (<label> review #<i+1>)"``; the running number is what keeps
    repeated templates distinct.

    Args:
        seed_list: Non-empty sequence of template review texts.
        label: Text written into the tag of every generated review.
        n: Number of reviews to produce. Defaults to 50.

    Returns:
        list[str]: ``n`` tagged review texts, in generation order.

    Raises:
        ValueError: If ``seed_list`` is empty (there is nothing to cycle
            through; the modulo would otherwise divide by zero).
    """
    # NOTE(review): the tag writes `label` into the review text itself. If a
    # later step infers sentiment from review_text, the answer is embedded in
    # the input; confirm that is intended.
    template_count = len(seed_list)
    if template_count == 0:
        raise ValueError("seed_list must contain at least one review text")
    return [
        f"{seed_list[i % template_count]} ({label} review #{i + 1})"
        for i in range(n)
    ]
synthetic_reviews_by_sentiment.items()})\n", + "print(\"Example positive:\", synthetic_reviews_by_sentiment[\"positive\"][:2])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fQhfVaDmuULT" + }, + "source": [ + "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "id": "l2SRc3PjuTGM" + }, + "outputs": [], + "source": [ + "review_rows = []\n", + "for _, row in df_books.iterrows():\n", + " title = row['title']\n", + " sentiment_label = row['sentiment_label']\n", + " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", + " sampled_reviews = random.sample(review_pool, 10)\n", + " for review_text in sampled_reviews:\n", + " review_rows.append({\n", + " \"title\": title,\n", + " \"sentiment_label\": sentiment_label,\n", + " \"review_text\": review_text,\n", + " \"rating\": row['rating'],\n", + " \"popularity_score\": row['popularity_score']\n", + " })" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bmJMXF-Bukdm" + }, + "source": [ + "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "id": "ZUKUqZsuumsp" + }, + "outputs": [], + "source": [ + "df_reviews = pd.DataFrame(review_rows)\n", + "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### *c. 
# Build the two CSV inputs for the R analysis:
#   * synthetic_title_level_features.csv   - one row per title
#   * synthetic_monthly_revenue_series.csv - one row per month
import numpy as np

# Ratings in the book/review tables are spelled out ("One" ... "Five").
_RATING_WORDS = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}


def _safe_num(s):
    """Coerce values to numbers; anything unparseable becomes NaN.

    Already-numeric input is passed through unchanged, so signs and
    scientific notation survive. Text input has every character other than
    digits and "." removed first (e.g. a currency symbol in front of a
    price), which also means a minus sign in *text* is dropped.

    Parameters
    ----------
    s : array-like or Series

    Returns
    -------
    pandas.Series of numeric dtype, same index as ``pd.Series(s)``.
    """
    values = pd.Series(s)
    is_plain_number = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if is_plain_number:
        return pd.to_numeric(values, errors="coerce")
    digits_only = values.astype(str).str.replace(r"[^0-9.]", "", regex=True)
    return pd.to_numeric(digits_only, errors="coerce")


def _rating_to_num(s):
    """Convert ratings to numbers, understanding the words "One" to "Five".

    Word ratings are looked up case-insensitively; everything else falls
    through to ``_safe_num`` (so numeric ratings keep working and unknown
    text becomes NaN).
    """
    words = pd.Series(s).astype(str).str.strip().str.lower()
    return _safe_num(words.map(lambda w: _RATING_WORDS.get(w, w)))


# --- Clean book metadata (price/rating) ---
df_books_r = df_books.copy()
for _col, _cleaner in (("price", _safe_num), ("rating", _rating_to_num)):
    if _col in df_books_r.columns:
        df_books_r[_col] = _cleaner(df_books_r[_col])
    else:
        # The joins below select these columns unconditionally; create them
        # empty so a missing column degrades to NaN instead of a KeyError.
        df_books_r[_col] = np.nan

df_books_r["title"] = df_books_r["title"].astype(str).str.strip()

# One metadata row per title. Without this, duplicate titles in the book
# table multiply rows in every merge below (duplicated title rows, inflated
# monthly revenue).
book_lookup = (
    df_books_r[["title", "price", "rating"]]
    .drop_duplicates(subset="title", keep="first")
)

# --- Clean sales ---
df_sales_r = df_sales.copy()
df_sales_r["title"] = df_sales_r["title"].astype(str).str.strip()
df_sales_r["month"] = pd.to_datetime(df_sales_r["month"], errors="coerce")
df_sales_r["units_sold"] = _safe_num(df_sales_r["units_sold"])

# --- Clean reviews ---
df_reviews_r = df_reviews.copy()
df_reviews_r["title"] = df_reviews_r["title"].astype(str).str.strip()
df_reviews_r["sentiment_label"] = (
    df_reviews_r["sentiment_label"].astype(str).str.lower().str.strip()
)
if "rating" in df_reviews_r.columns:
    df_reviews_r["rating"] = _rating_to_num(df_reviews_r["rating"])
if "popularity_score" in df_reviews_r.columns:
    df_reviews_r["popularity_score"] = _safe_num(df_reviews_r["popularity_score"])

# --- Sentiment shares per title (from reviews) ---
sent_counts = (
    df_reviews_r.groupby(["title", "sentiment_label"])
    .size()
    .unstack(fill_value=0)
)
# Guarantee all three label columns exist even if a label never occurs.
for lab in ["positive", "neutral", "negative"]:
    if lab not in sent_counts.columns:
        sent_counts[lab] = 0

sent_counts["total_reviews"] = sent_counts[["positive", "neutral", "negative"]].sum(axis=1)
# A zero denominator becomes NaN so the shares are NaN rather than inf.
den = sent_counts["total_reviews"].replace(0, np.nan)
sent_counts["share_positive"] = sent_counts["positive"] / den
sent_counts["share_neutral"] = sent_counts["neutral"] / den
sent_counts["share_negative"] = sent_counts["negative"] / den
sent_counts = sent_counts.reset_index()

# --- Sales aggregation per title ---
sales_by_title = (
    df_sales_r.dropna(subset=["title"])
    .groupby("title", as_index=False)
    .agg(
        months_observed=("month", "nunique"),
        avg_units_sold=("units_sold", "mean"),
        total_units_sold=("units_sold", "sum"),
    )
)

# --- Title-level features (join sales + books + sentiment) ---
# validate= makes pandas raise if either side unexpectedly repeats a title.
df_title = (
    sales_by_title
    .merge(book_lookup, on="title", how="left", validate="one_to_one")
    .merge(
        sent_counts[["title", "share_positive", "share_neutral", "share_negative", "total_reviews"]],
        on="title", how="left", validate="one_to_one",
    )
)

df_title["avg_revenue"] = df_title["avg_units_sold"] * df_title["price"]
df_title["total_revenue"] = df_title["total_units_sold"] * df_title["price"]

df_title.to_csv("synthetic_title_level_features.csv", index=False)
print("✅ Wrote synthetic_title_level_features.csv")

# --- Monthly revenue series (proxy: units_sold * price) ---
monthly_rev = df_sales_r.merge(
    book_lookup[["title", "price"]], on="title", how="left", validate="many_to_one"
)
monthly_rev["revenue"] = monthly_rev["units_sold"] * monthly_rev["price"]

dated_sales = monthly_rev.dropna(subset=["month"])
# Decide on the fallback BEFORE aggregating: groupby().sum() turns an all-NaN
# group into 0.0, so checking the aggregated column can never detect
# "no revenue available".
if dated_sales["revenue"].notna().any():
    value_col = "revenue"
else:
    # No usable price anywhere: fall back to units_sold as a teaching proxy.
    value_col = "units_sold"

df_monthly = (
    dated_sales
    .groupby("month", as_index=False)[value_col]
    .sum()
    .rename(columns={value_col: "total_revenue"})
    .sort_values("month")
)

df_monthly["month"] = pd.to_datetime(df_monthly["month"], errors="coerce").dt.strftime("%Y-%m-%d")
df_monthly.to_csv("synthetic_monthly_revenue_series.csv", index=False)
print("✅ Wrote synthetic_monthly_revenue_series.csv")
Three 3 \n", + "2 The ideas were interesting, even if the execut... Three 3 \n", + "3 Not bad, not great β€” a solid average book. (ne... Three 3 \n", + "4 Not bad, not great β€” a solid average book. (ne... Three 3 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralNot bad, not great β€” a solid average book. (ne...Three3
1A Light in the AtticneutralThe ideas were interesting, even if the execut...Three3
2A Light in the AtticneutralThe ideas were interesting, even if the execut...Three3
3A Light in the AtticneutralNot bad, not great β€” a solid average book. (ne...Three3
4A Light in the AtticneutralNot bad, not great β€” a solid average book. (ne...Three3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display(df_monthly\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A Light in the Attic\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The ideas were interesting, even if the execution was uneven. (neutral review #29)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 3,\n \"num_unique_values\": 1,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "df_title head (R title-level features):\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " title months_observed \\\n", + "0 \"Most Blessed of the Patriarchs\": Thomas Jeffe... 18 \n", + "1 \"Most Blessed of the Patriarchs\": Thomas Jeffe... 18 \n", + "2 #GIRLBOSS 18 \n", + "3 #GIRLBOSS 18 \n", + "4 #HigherSelfie: Wake Up Your Life. Free Your So... 
18 \n", + "\n", + " avg_units_sold total_units_sold price rating share_positive \\\n", + "0 269.138889 9689 44.48 NaN 1.0 \n", + "1 269.138889 9689 44.48 NaN 1.0 \n", + "2 84.583333 3045 50.96 NaN 0.0 \n", + "3 84.583333 3045 50.96 NaN 0.0 \n", + "4 294.361111 10597 23.11 NaN 1.0 \n", + "\n", + " share_neutral share_negative total_reviews avg_revenue total_revenue \n", + "0 0.0 0.0 20 11971.297778 430966.72 \n", + "1 0.0 0.0 20 11971.297778 430966.72 \n", + "2 0.5 0.5 20 4310.366667 155173.20 \n", + "3 0.5 0.5 20 4310.366667 155173.20 \n", + "4 0.0 0.0 20 6802.685278 244896.67 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlemonths_observedavg_units_soldtotal_units_soldpriceratingshare_positiveshare_neutralshare_negativetotal_reviewsavg_revenuetotal_revenue
0\"Most Blessed of the Patriarchs\": Thomas Jeffe...18269.138889968944.48NaN1.00.00.02011971.297778430966.72
1\"Most Blessed of the Patriarchs\": Thomas Jeffe...18269.138889968944.48NaN1.00.00.02011971.297778430966.72
2#GIRLBOSS1884.583333304550.96NaN0.00.50.5204310.366667155173.20
3#GIRLBOSS1884.583333304550.96NaN0.00.50.5204310.366667155173.20
4#HigherSelfie: Wake Up Your Life. Free Your So...18294.3611111059723.11NaN1.00.00.0206802.685278244896.67
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display(df_monthly\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"\\\"Most Blessed of the Patriarchs\\\": Thomas Jefferson and the Empire of the Imagination\",\n \"#GIRLBOSS\",\n \"#HigherSelfie: Wake Up Your Life. Free Your Soul. Find Your Tribe.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"months_observed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 18,\n \"max\": 18,\n \"num_unique_values\": 1,\n \"samples\": [\n 18\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 106.1905752484887,\n \"min\": 84.58333333333333,\n \"max\": 294.3611111111111,\n \"num_unique_values\": 3,\n \"samples\": [\n 269.1388888888889\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3822,\n \"min\": 3045,\n \"max\": 10597,\n \"num_unique_values\": 3,\n \"samples\": [\n 9689\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11.472925520546188,\n \"min\": 23.11,\n \"max\": 50.96,\n \"num_unique_values\": 3,\n \"samples\": [\n 44.48\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": null,\n \"max\": null,\n \"num_unique_values\": 0,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"share_positive\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n 
\"num_unique_values\": 2,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"share_neutral\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2738612787525831,\n \"min\": 0.0,\n \"max\": 0.5,\n \"num_unique_values\": 2,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"share_negative\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2738612787525831,\n \"min\": 0.0,\n \"max\": 0.5,\n \"num_unique_values\": 2,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 20,\n \"max\": 20,\n \"num_unique_values\": 1,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3876.930979690344,\n \"min\": 4310.366666666667,\n \"max\": 11971.297777777778,\n \"num_unique_values\": 3,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"total_revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 139569.51526885238,\n \"min\": 155173.2,\n \"max\": 430966.72,\n \"num_unique_values\": 3,\n \"samples\": [],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "df_monthly head (R monthly revenue series):\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " month total_revenue\n", + "0 2024-09-01 22645413.46\n", + "1 2024-10-01 23489894.44\n", + "2 2024-11-01 24154755.06\n", + "3 2024-12-01 24362687.78\n", + "4 2025-01-01 24215463.82" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthtotal_revenue
02024-09-0122645413.46
12024-10-0123489894.44
22024-11-0124154755.06
32024-12-0124362687.78
42025-01-0124215463.82
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
# d. Preview the first rows of each table produced above.
def _show_head(caption, frame):
    """Print a caption, then render the first rows of ``frame``."""
    print(caption)
    display(frame.head())


_show_head("df_reviews head:", df_reviews)
_show_head("\ndf_title head (R title-level features):", df_title)
_show_head("\ndf_monthly head (R monthly revenue series):", df_monthly)
"name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file