Spaces:

ESCP
/

session5

Sleeping

File size: 33,385 Bytes

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4ba6aba8"
      },
      "source": [
        "# 🤖 **Data Collection, Creation, Storage, and Processing**\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jpASMyIQMaAq"
      },
      "source": [
        "## **1.** 📦 Install required packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f48c8f8c",
        "outputId": "589fe704-b3da-4c3a-b4d8-c8e877c7c88a"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
            "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
            "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
            "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
            "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n",
            "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
            "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)\n",
            "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
            "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
            "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
            "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
          ]
        }
      ],
      "source": [
        "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lquNYCbfL9IM"
      },
      "source": [
        "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0IWuNpxxYDJF"
      },
      "source": [
        "### *a. Initial setup*\n",
        "Define the base url of the website you will scrape as well as how and what you will scrape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "91d52125"
      },
      "outputs": [],
      "source": [
        "import requests\n",
        "from bs4 import BeautifulSoup\n",
        "import pandas as pd\n",
        "import time\n",
        "\n",
        "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
        "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
        "\n",
        "titles, prices, ratings = [], [], []"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oCdTsin2Yfp3"
      },
      "source": [
        "### *b. Fill titles, prices, and ratings from the web pages*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xqO5Y3dnYhxt"
      },
      "outputs": [],
      "source": [
        "# Loop through all 50 pages\n",
        "for page in range(1, 51):\n",
        "    url = base_url.format(page)\n",
        "    response = requests.get(url, headers=headers)\n",
        "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
        "    books = soup.find_all(\"article\", class_=\"product_pod\")\n",
        "\n",
        "    for book in books:\n",
        "        titles.append(book.h3.a[\"title\"])\n",
        "        prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n",
        "        ratings.append(book.p.get(\"class\")[1])\n",
        "\n",
        "    time.sleep(0.5)  # polite scraping delay"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T0TOeRC4Yrnn"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l5FkkNhUYTHh"
      },
      "outputs": [],
      "source": [
        "df_books = pd.DataFrame({\n",
        "    \"title\": titles,\n",
        "    \"price\": prices,\n",
        "    \"rating\": ratings\n",
        "})"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "duI5dv3CZYvF"
      },
      "source": [
        "### *d. Save web-scraped dataframe either as a CSV or Excel file*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lC1U_YHtZifh"
      },
      "outputs": [],
      "source": [
        "# 💾 Save to CSV\n",
        "df_books.to_csv(\"books_data.csv\", index=False)\n",
        "\n",
        "# 💾 Or save to Excel\n",
        "# df_books.to_excel(\"books_data.xlsx\", index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qMjRKMBQZlJi"
      },
      "source": [
        "### *e. ✋🏻🛑⛔️ View first fiew lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "O_wIvTxYZqCK"
      },
      "outputs": [],
      "source": [
        "df_books.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "p-1Pr2szaqLk"
      },
      "source": [
        "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SIaJUGIpaH4V"
      },
      "source": [
        "### *a. Initial setup*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-gPXGcRPuV_9"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import random\n",
        "from datetime import datetime\n",
        "import warnings\n",
        "\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "random.seed(2025)\n",
        "np.random.seed(2025)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pY4yCoIuaQqp"
      },
      "source": [
        "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mnd5hdAbaNjz"
      },
      "outputs": [],
      "source": [
        "def generate_popularity_score(rating):\n",
        "    base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n",
        "    trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n",
        "    return int(np.clip(base + trend_factor, 1, 5))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "n4-TaNTFgPak"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "V-G3OCUCgR07"
      },
      "outputs": [],
      "source": [
        "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HnngRNTgacYt"
      },
      "source": [
        "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kUtWmr8maZLZ"
      },
      "outputs": [],
      "source": [
        "def get_sentiment(popularity_score):\n",
        "    if popularity_score <= 2:\n",
        "        return \"negative\"\n",
        "    elif popularity_score == 3:\n",
        "        return \"neutral\"\n",
        "    else:\n",
        "        return \"positive\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HF9F9HIzgT7Z"
      },
      "source": [
        "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tafQj8_7gYCG"
      },
      "outputs": [],
      "source": [
        "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T8AdKkmASq9a"
      },
      "source": [
        "## **4.** 📈 Generate synthetic book sales data of 18 months"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OhXbdGD5fH0c"
      },
      "source": [
        "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qkVhYPXGbgEn"
      },
      "outputs": [],
      "source": [
        "def generate_sales_profile(sentiment):\n",
        "    months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n",
        "\n",
        "    if sentiment == \"positive\":\n",
        "        base = random.randint(200, 300)\n",
        "        trend = np.linspace(base, base + random.randint(20, 60), len(months))\n",
        "    elif sentiment == \"negative\":\n",
        "        base = random.randint(20, 80)\n",
        "        trend = np.linspace(base, base - random.randint(10, 30), len(months))\n",
        "    else:  # neutral\n",
        "        base = random.randint(80, 160)\n",
        "        trend = np.full(len(months), base + random.randint(-10, 10))\n",
        "\n",
        "    seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n",
        "    noise = np.random.normal(0, 5, len(months))\n",
        "    monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
        "\n",
        "    return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L2ak1HlcgoTe"
      },
      "source": [
        "### *b. Run the function as part of building sales_data*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SlJ24AUafoDB"
      },
      "outputs": [],
      "source": [
        "sales_data = []\n",
        "for _, row in df_books.iterrows():\n",
        "    records = generate_sales_profile(row[\"sentiment_label\"])\n",
        "    for month, units in records:\n",
        "        sales_data.append({\n",
        "            \"title\": row[\"title\"],\n",
        "            \"month\": month,\n",
        "            \"units_sold\": units,\n",
        "            \"sentiment_label\": row[\"sentiment_label\"]\n",
        "        })"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4IXZKcCSgxnq"
      },
      "source": [
        "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wcN6gtiZg-ws"
      },
      "outputs": [],
      "source": [
        "df_sales = pd.DataFrame(sales_data)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EhIjz9WohAmZ"
      },
      "source": [
        "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MzbZvLcAhGaH"
      },
      "outputs": [],
      "source": [
        "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
        "\n",
        "print(df_sales.head())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7g9gqBgQMtJn"
      },
      "source": [
        "## **5.** 🎯 Generate synthetic customer reviews"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Gi4y9M9KuDWx"
      },
      "source": [
        "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b3cd2a50"
      },
      "outputs": [],
      "source": [
        "synthetic_reviews_by_sentiment = {\n",
        "    \"positive\": [\n",
        "        \"A compelling and heartwarming read that stayed with me long after I finished.\",\n",
        "        \"Brilliantly written! The characters were unforgettable and the plot was engaging.\",\n",
        "        \"One of the best books I've read this year — inspiring and emotionally rich.\",\n",
        "        \"Absolutely loved this book from beginning to end.\",\n",
        "        \"The storytelling was immersive and beautifully crafted.\",\n",
        "        \"An outstanding novel with depth and heart.\",\n",
        "        \"A truly captivating and uplifting experience.\",\n",
        "        \"The characters felt real and relatable.\",\n",
        "        \"A masterpiece that exceeded my expectations.\",\n",
        "        \"Emotionally powerful and wonderfully written.\",\n",
        "        \"A gripping story that kept me turning pages.\",\n",
        "        \"Incredibly well-developed plot and themes.\",\n",
        "        \"A fantastic read that I would highly recommend.\",\n",
        "        \"The pacing was perfect and the ending satisfying.\",\n",
        "        \"An inspiring and thought-provoking book.\",\n",
        "        \"Beautiful prose and compelling narrative.\",\n",
        "        \"One of the most memorable books I've read.\",\n",
        "        \"A rich and engaging literary journey.\",\n",
        "        \"Heartfelt and meaningful storytelling.\",\n",
        "        \"An exceptional piece of writing.\",\n",
        "        \"Loved every chapter of this book.\",\n",
        "        \"A remarkable and moving story.\",\n",
        "        \"Creative, original, and deeply engaging.\",\n",
        "        \"The author truly brought the story to life.\",\n",
        "        \"An unforgettable reading experience.\",\n",
        "        \"Deeply touching and emotionally resonant.\",\n",
        "        \"A brilliant concept executed perfectly.\",\n",
        "        \"The dialogue felt authentic and powerful.\",\n",
        "        \"A must-read for fans of great storytelling.\",\n",
        "        \"Absolutely brilliant from start to finish.\",\n",
        "        \"A story that lingers in your mind.\",\n",
        "        \"Engaging, emotional, and beautifully written.\",\n",
        "        \"The plot twists were masterfully done.\",\n",
        "        \"An inspiring and satisfying novel.\",\n",
        "        \"Highly enjoyable and expertly crafted.\",\n",
        "        \"The characters had incredible depth.\",\n",
        "        \"A captivating and heartwarming story.\",\n",
        "        \"Wonderful balance of drama and emotion.\",\n",
        "        \"A powerful narrative with strong themes.\",\n",
        "        \"Simply outstanding in every way.\",\n",
        "        \"A joy to read and experience.\",\n",
        "        \"The writing style was elegant and immersive.\",\n",
        "        \"A thoughtful and engaging story.\",\n",
        "        \"A beautifully structured novel.\",\n",
        "        \"An impressive and rewarding read.\",\n",
        "        \"Emotionally gripping and meaningful.\",\n",
        "        \"A standout book in its genre.\",\n",
        "        \"A delightful and compelling read.\",\n",
        "        \"Strong storytelling and vivid imagery.\",\n",
        "        \"A truly excellent novel.\"\n",
        "    ],\n",
        "    \"neutral\": [\n",
        "        \"An average book — not great, but not bad either.\",\n",
        "        \"Some parts really stood out, others felt a bit flat.\",\n",
        "        \"It was okay overall. A decent way to pass the time.\",\n",
        "        \"A fairly standard and predictable story.\",\n",
        "        \"Not particularly memorable, but not terrible.\",\n",
        "        \"The pacing was fine, though nothing special.\",\n",
        "        \"An acceptable read with some interesting moments.\",\n",
        "        \"It had strengths and weaknesses throughout.\",\n",
        "        \"A moderately engaging book.\",\n",
        "        \"Nothing extraordinary, but readable.\",\n",
        "        \"The concept was good, execution was average.\",\n",
        "        \"Some chapters were better than others.\",\n",
        "        \"A decent storyline with mixed results.\",\n",
        "        \"Not bad, just not remarkable.\",\n",
        "        \"An ordinary reading experience.\",\n",
        "        \"It met expectations but didn't exceed them.\",\n",
        "        \"Fairly typical for its genre.\",\n",
        "        \"A simple and straightforward narrative.\",\n",
        "        \"Reasonably enjoyable but not exciting.\",\n",
        "        \"A serviceable and competent story.\",\n",
        "        \"The writing was solid but unremarkable.\",\n",
        "        \"An average addition to the genre.\",\n",
        "        \"Entertaining enough, though forgettable.\",\n",
        "        \"The characters were fine but lacked depth.\",\n",
        "        \"Neither impressive nor disappointing.\",\n",
        "        \"A balanced mix of good and weak elements.\",\n",
        "        \"An okay book for a weekend read.\",\n",
        "        \"Predictable but coherent storytelling.\",\n",
        "        \"A moderately interesting concept.\",\n",
        "        \"Some moments stood out positively.\",\n",
        "        \"The ending was satisfactory.\",\n",
        "        \"A fair and decent reading experience.\",\n",
        "        \"Competent writing without surprises.\",\n",
        "        \"It had potential but felt safe.\",\n",
        "        \"A mild and steady narrative.\",\n",
        "        \"Reasonably structured but not innovative.\",\n",
        "        \"An average level of engagement.\",\n",
        "        \"Acceptable but not particularly gripping.\",\n",
        "        \"Not very original but readable.\",\n",
        "        \"An overall neutral experience.\",\n",
        "        \"A steady but unexciting plot.\",\n",
        "        \"Moderately well-written.\",\n",
        "        \"It had some good ideas.\",\n",
        "        \"A fairly consistent story.\",\n",
        "        \"Nothing too impressive or disappointing.\",\n",
        "        \"The execution was decent.\",\n",
        "        \"A readable yet ordinary novel.\",\n",
        "        \"Fine but not something I'd reread.\",\n",
        "        \"An average literary effort.\",\n",
        "        \"Satisfactory overall.\"\n",
        "    ],\n",
        "    \"negative\": [\n",
        "        \"I struggled to get through this one — it just didn’t grab me.\",\n",
        "        \"The plot was confusing and the characters felt underdeveloped.\",\n",
        "        \"Disappointing. I had high hopes, but they weren't met.\",\n",
        "        \"The pacing was painfully slow.\",\n",
        "        \"I couldn't connect with the story at all.\",\n",
        "        \"The writing style didn't work for me.\",\n",
        "        \"Predictable and uninspired throughout.\",\n",
        "        \"The ending was abrupt and unsatisfying.\",\n",
        "        \"It felt disjointed and poorly structured.\",\n",
        "        \"The characters lacked depth and realism.\",\n",
        "        \"I found it difficult to stay interested.\",\n",
        "        \"The dialogue felt forced and unnatural.\",\n",
        "        \"Too many plot holes to ignore.\",\n",
        "        \"An underwhelming reading experience.\",\n",
        "        \"The story lacked emotional impact.\",\n",
        "        \"Not as engaging as I expected.\",\n",
        "        \"The narrative felt messy and unclear.\",\n",
        "        \"I wouldn’t recommend this book.\",\n",
        "        \"It simply failed to hold my attention.\",\n",
        "        \"A frustrating and disappointing read.\",\n",
        "        \"The themes were poorly developed.\",\n",
        "        \"The plot felt repetitive and dull.\",\n",
        "        \"I expected much more from this book.\",\n",
        "        \"The characters were hard to relate to.\",\n",
        "        \"It dragged on unnecessarily.\",\n",
        "        \"A weak execution of an interesting idea.\",\n",
        "        \"I nearly gave up halfway through.\",\n",
        "        \"The storyline lacked coherence.\",\n",
        "        \"Not memorable in any positive way.\",\n",
        "        \"The writing felt flat and uninspired.\",\n",
        "        \"The structure was confusing.\",\n",
        "        \"It didn't live up to the description.\",\n",
        "        \"The pacing ruined the experience.\",\n",
        "        \"The book felt rushed at the end.\",\n",
        "        \"I struggled to understand the direction.\",\n",
        "        \"The development was inconsistent.\",\n",
        "        \"A bland and forgettable novel.\",\n",
        "        \"The plot twists were predictable.\",\n",
        "        \"The emotional depth was missing.\",\n",
        "        \"It felt incomplete and unsatisfying.\",\n",
        "        \"The storytelling lacked clarity.\",\n",
        "        \"The characters felt one-dimensional.\",\n",
        "        \"Overall, a disappointing read.\",\n",
        "        \"The narrative was difficult to follow.\",\n",
        "        \"I was bored most of the time.\",\n",
        "        \"The writing lacked polish.\",\n",
        "        \"It failed to deliver on its promise.\",\n",
        "        \"The story never truly engaged me.\",\n",
        "        \"A poorly executed concept.\",\n",
        "        \"Simply not enjoyable.\"\n",
        "    ]\n",
        "}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fQhfVaDmuULT"
      },
      "source": [
        "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l2SRc3PjuTGM"
      },
      "outputs": [],
      "source": [
        "review_rows = []\n",
        "for _, row in df_books.iterrows():\n",
        "    title = row['title']\n",
        "    sentiment_label = row['sentiment_label']\n",
        "    review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
        "    sampled_reviews = random.sample(review_pool, 10)\n",
        "    for review_text in sampled_reviews:\n",
        "        review_rows.append({\n",
        "            \"title\": title,\n",
        "            \"sentiment_label\": sentiment_label,\n",
        "            \"review_text\": review_text,\n",
        "            \"rating\": row['rating'],\n",
        "            \"popularity_score\": row['popularity_score']\n",
        "        })"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bmJMXF-Bukdm"
      },
      "source": [
        "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZUKUqZsuumsp"
      },
      "outputs": [],
      "source": [
        "df_reviews = pd.DataFrame(review_rows)\n",
        "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### *c. inputs for R*"
      ],
      "metadata": {
        "id": "_602pYUS3gY5"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3946e521"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "\n",
        "def _safe_num(s):\n",
        "    return pd.to_numeric(\n",
        "        pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n",
        "        errors=\"coerce\"\n",
        "    )\n",
        "\n",
        "# --- Clean book metadata (price/rating) ---\n",
        "df_books_r = df_books.copy()\n",
        "if \"price\" in df_books_r.columns:\n",
        "    df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n",
        "if \"rating\" in df_books_r.columns:\n",
        "    df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n",
        "\n",
        "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n",
        "\n",
        "# --- Clean sales ---\n",
        "df_sales_r = df_sales.copy()\n",
        "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n",
        "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n",
        "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n",
        "\n",
        "# --- Clean reviews ---\n",
        "df_reviews_r = df_reviews.copy()\n",
        "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n",
        "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n",
        "if \"rating\" in df_reviews_r.columns:\n",
        "    df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n",
        "if \"popularity_score\" in df_reviews_r.columns:\n",
        "    df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n",
        "\n",
        "# --- Sentiment shares per title (from reviews) ---\n",
        "sent_counts = (\n",
        "    df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n",
        "    .size()\n",
        "    .unstack(fill_value=0)\n",
        ")\n",
        "for lab in [\"positive\", \"neutral\", \"negative\"]:\n",
        "    if lab not in sent_counts.columns:\n",
        "        sent_counts[lab] = 0\n",
        "\n",
        "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n",
        "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n",
        "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n",
        "sent_counts[\"share_neutral\"]  = sent_counts[\"neutral\"]  / den\n",
        "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n",
        "sent_counts = sent_counts.reset_index()\n",
        "\n",
        "# --- Sales aggregation per title ---\n",
        "sales_by_title = (\n",
        "    df_sales_r.dropna(subset=[\"title\"])\n",
        "    .groupby(\"title\", as_index=False)\n",
        "    .agg(\n",
        "        months_observed=(\"month\", \"nunique\"),\n",
        "        avg_units_sold=(\"units_sold\", \"mean\"),\n",
        "        total_units_sold=(\"units_sold\", \"sum\"),\n",
        "    )\n",
        ")\n",
        "\n",
        "# --- Title-level features (join sales + books + sentiment) ---\n",
        "df_title = (\n",
        "    sales_by_title\n",
        "    .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n",
        "    .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n",
        "           on=\"title\", how=\"left\")\n",
        ")\n",
        "\n",
        "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n",
        "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n",
        "\n",
        "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n",
        "print(\"✅ Wrote synthetic_title_level_features.csv\")\n",
        "\n",
        "# --- Monthly revenue series (proxy: units_sold * price) ---\n",
        "monthly_rev = (\n",
        "    df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n",
        ")\n",
        "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n",
        "\n",
        "df_monthly = (\n",
        "    monthly_rev.dropna(subset=[\"month\"])\n",
        "    .groupby(\"month\", as_index=False)[\"revenue\"]\n",
        "    .sum()\n",
        "    .rename(columns={\"revenue\": \"total_revenue\"})\n",
        "    .sort_values(\"month\")\n",
        ")\n",
        "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n",
        "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n",
        "    df_monthly = (\n",
        "        df_sales_r.dropna(subset=[\"month\"])\n",
        "        .groupby(\"month\", as_index=False)[\"units_sold\"]\n",
        "        .sum()\n",
        "        .rename(columns={\"units_sold\": \"total_revenue\"})\n",
        "        .sort_values(\"month\")\n",
        "    )\n",
        "\n",
        "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n",
        "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n",
        "print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RYvGyVfXuo54"
      },
      "source": [
        "### *d. ✋🏻🛑⛔️ View the first few lines*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xfE8NMqOurKo"
      },
      "outputs": [],
      "source": [
        "df_reviews.head()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "jpASMyIQMaAq",
        "lquNYCbfL9IM",
        "0IWuNpxxYDJF",
        "oCdTsin2Yfp3",
        "T0TOeRC4Yrnn",
        "duI5dv3CZYvF",
        "qMjRKMBQZlJi",
        "p-1Pr2szaqLk",
        "SIaJUGIpaH4V",
        "pY4yCoIuaQqp",
        "n4-TaNTFgPak",
        "HnngRNTgacYt",
        "HF9F9HIzgT7Z",
        "T8AdKkmASq9a",
        "OhXbdGD5fH0c",
        "L2ak1HlcgoTe",
        "4IXZKcCSgxnq",
        "EhIjz9WohAmZ",
        "Gi4y9M9KuDWx",
        "fQhfVaDmuULT",
        "bmJMXF-Bukdm",
        "RYvGyVfXuo54"
      ],
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}