{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# ๐Ÿค– **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** ๐Ÿ“ฆ Install required packages" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "58a1c753-5c10-4635-8a1b-df629b67e2ac" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** โ› Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 pages\n", "for page in range(1, 51):\n", " url = base_url.format(page)\n", " response = requests.get(url, headers=headers)\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", " for book in books:\n", " titles.append(book.h3.a[\"title\"])\n", " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", " ratings.append(book.p.get(\"class\")[1])\n", "\n", " time.sleep(0.5) # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "l5FkkNhUYTHh" }, "outputs": [], "source": [ "df_books= pd.DataFrame({'title': titles, 'price': prices, 'rating': ratings})" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# ๐Ÿ’พ Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# ๐Ÿ’พ Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ View first fiew lines*" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "O_wIvTxYZqCK", "outputId": "3d22c37a-d5b2-4818-f6d8-3005af8b5387" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 7 } ], "source": [ "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** ๐Ÿงฉ Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", " return int(np.clip(base + trend_factor, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "V-G3OCUCgR07" }, "outputs": [], "source": [ "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "tafQj8_7gYCG" }, "outputs": [], "source": [ "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** ๐Ÿ“ˆ Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment):\n", " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", "\n", " if sentiment == \"positive\":\n", " base = random.randint(200, 300)\n", " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", " elif sentiment == \"negative\":\n", " base = random.randint(20, 80)\n", " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", " else: # neutral\n", " base = random.randint(80, 160)\n", " trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", " noise = np.random.normal(0, 5, len(months))\n", " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", " records = generate_sales_profile(row[\"sentiment_label\"])\n", " for month, units in records:\n", " sales_data.append({\n", " \"title\": row[\"title\"],\n", " \"month\": month,\n", " \"units_sold\": units,\n", " \"sentiment_label\": row[\"sentiment_label\"]\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "wcN6gtiZg-ws" }, "outputs": [], "source": [ "df_sales= pd.DataFrame(sales_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MzbZvLcAhGaH", "outputId": "18f70c1e-9370-43ce-eac8-2e7071894aac" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-08 100 neutral\n", "1 A Light in the Attic 2024-09 109 neutral\n", "2 A Light in the Attic 2024-10 102 neutral\n", "3 A Light in the Attic 2024-11 107 neutral\n", "4 A Light in the Attic 2024-12 108 neutral\n" ] } ], "source": [ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "print(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** ๐ŸŽฏ Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "id": "b3cd2a50" }, "outputs": [], "source": [ "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"An absolutely captivating read from start to finish.\",\n", " \"I couldnโ€™t put this book down.\",\n", " \"A beautifully written and deeply moving story.\",\n", " \"The characters felt incredibly real and relatable.\",\n", " \"One of the best books Iโ€™ve read this year.\",\n", " \"A powerful and inspiring narrative.\",\n", " \"The plot twists kept me hooked throughout.\",\n", " \"Exceptionally well-crafted and engaging.\",\n", " \"A masterpiece of storytelling.\",\n", " \"The writing style was elegant and immersive.\",\n", " \"I loved every chapter of this book.\",\n", " \"An unforgettable reading experience.\",\n", " \"The pacing was perfect and exciting.\",\n", " \"A heartwarming and uplifting story.\",\n", " \"Rich character development and strong themes.\",\n", " \"Truly a remarkable novel.\",\n", " \"This book exceeded my expectations.\",\n", " \"A compelling and thoughtful story.\",\n", " \"The dialogue felt authentic and natural.\",\n", " \"A brilliant and imaginative tale.\",\n", " \"I highly recommend this book.\",\n", " \"An emotional and satisfying journey.\",\n", " \"The ending was incredibly satisfying.\",\n", " \"A fresh and original perspective.\",\n", " \"An engaging and memorable story.\",\n", " \"The authorโ€™s voice was strong and confident.\",\n", " \"A beautifully structured narrative.\",\n", " \"The world-building was impressive.\",\n", " \"A delightful and entertaining read.\",\n", " \"This story stayed with me long after finishing.\",\n", " \"An outstanding contribution to the genre.\",\n", " \"The themes were handled with care and depth.\",\n", " \"A wonderfully immersive experience.\",\n", " \"The writing was both poetic and clear.\",\n", " \"A thought-provoking and rewarding read.\",\n", " \"The story was gripping and dynamic.\",\n", " \"A charming and well-paced novel.\",\n", " \"The characters evolved beautifully.\",\n", " \"An emotionally resonant book.\",\n", " \"A captivating and intelligent story.\",\n", " \"The plot was cleverly constructed.\",\n", " \"An inspiring and meaningful narrative.\",\n", " \"A truly enjoyable book.\",\n", " \"The suspense was expertly maintained.\",\n", " \"A compelling mix of drama and insight.\",\n", " \"The storytelling was vivid and engaging.\",\n", " \"A memorable and touching novel.\",\n", " \"The author delivered a fantastic story.\",\n", " \"An excellent and satisfying read.\",\n", " \"I would gladly read this again.\"\n", " ],\n", "\n", " \"neutral\": [\n", " \"The book was okay overall.\",\n", " \"It was an average reading experience.\",\n", " \"Some parts were interesting, others less so.\",\n", " \"The story was fine but not exceptional.\",\n", " \"A fairly standard plot.\",\n", " \"The characters were decent but not memorable.\",\n", " \"It had its moments but nothing groundbreaking.\",\n", " \"An ordinary book with predictable elements.\",\n", " \"The writing style was straightforward.\",\n", " \"It was neither great nor terrible.\",\n", " \"The pacing was somewhat uneven.\",\n", " \"A moderate and balanced read.\",\n", " \"The themes were presented clearly.\",\n", " \"The book met my expectations.\",\n", " \"It was a reasonable way to spend time.\",\n", " \"The storyline was simple and easy to follow.\",\n", " \"Some chapters were stronger than others.\",\n", " \"An acceptable but unremarkable novel.\",\n", " \"The ending was satisfactory.\",\n", " \"The characters served their purpose.\",\n", " \"The narrative felt conventional.\",\n", " \"It was a steady but calm story.\",\n", " \"The dialogue was functional.\",\n", " \"The plot moved at a steady pace.\",\n", " \"An average entry in its genre.\",\n", " \"Nothing particularly stood out.\",\n", " \"It was readable but not thrilling.\",\n", " \"The story had both strengths and weaknesses.\",\n", " \"A mildly engaging experience.\",\n", " \"The setting was described adequately.\",\n", " \"The writing was clear and simple.\",\n", " \"It delivered what it promised.\",\n", " \"The book was competently written.\",\n", " \"It felt somewhat familiar.\",\n", " \"The structure was conventional.\",\n", " \"The tone remained consistent throughout.\",\n", " \"It was fairly predictable.\",\n", " \"The story was easy to follow.\",\n", " \"An overall balanced book.\",\n", " \"The ideas were presented plainly.\",\n", " \"A neutral reading experience.\",\n", " \"It didnโ€™t evoke strong emotions.\",\n", " \"The plot resolution was acceptable.\",\n", " \"The themes were explored briefly.\",\n", " \"The characters were average.\",\n", " \"It was neither disappointing nor impressive.\",\n", " \"The book maintained a steady rhythm.\",\n", " \"An uncomplicated narrative.\",\n", " \"A straightforward reading experience.\",\n", " \"It was just fine.\"\n", " ],\n", "\n", " \"negative\": [\n", " \"I struggled to finish this book.\",\n", " \"The story felt dull and uninspired.\",\n", " \"The characters were flat and unconvincing.\",\n", " \"The pacing was painfully slow.\",\n", " \"I found the plot confusing and weak.\",\n", " \"The writing style was difficult to enjoy.\",\n", " \"This book did not meet my expectations.\",\n", " \"The ending was disappointing.\",\n", " \"It lacked depth and originality.\",\n", " \"The dialogue felt forced.\",\n", " \"I lost interest halfway through.\",\n", " \"The narrative felt disorganized.\",\n", " \"The themes were poorly developed.\",\n", " \"The story dragged unnecessarily.\",\n", " \"I couldnโ€™t connect with the characters.\",\n", " \"The plot twists felt unrealistic.\",\n", " \"The book was hard to follow.\",\n", " \"It felt repetitive and predictable.\",\n", " \"The writing lacked clarity.\",\n", " \"The storyline was underwhelming.\",\n", " \"The book felt rushed in parts.\",\n", " \"I didnโ€™t enjoy the authorโ€™s style.\",\n", " \"The pacing was inconsistent.\",\n", " \"The characters lacked personality.\",\n", " \"The overall experience was disappointing.\",\n", " \"The conflict felt unconvincing.\",\n", " \"The story failed to engage me.\",\n", " \"It was not as compelling as I hoped.\",\n", " \"The development felt shallow.\",\n", " \"The book lacked emotional impact.\",\n", " \"The ending felt abrupt.\",\n", " \"The plot felt overly complicated.\",\n", " \"The writing was monotonous.\",\n", " \"The story didnโ€™t hold my attention.\",\n", " \"The characters seemed unrealistic.\",\n", " \"The book felt overly long.\",\n", " \"The central idea was poorly executed.\",\n", " \"The narrative lacked focus.\",\n", " \"The story felt unfinished.\",\n", " \"The book was not memorable.\",\n", " \"The pacing made it hard to stay engaged.\",\n", " \"The themes were not explored deeply enough.\",\n", " \"It failed to leave an impression.\",\n", " \"The dialogue sounded unnatural.\",\n", " \"The storyline felt weak.\",\n", " \"The writing lacked energy.\",\n", " \"The book was disappointing overall.\",\n", " \"It did not deliver on its premise.\",\n", " \"The plot was not compelling.\",\n", " \"I wouldnโ€™t recommend this book.\"\n", " ]\n", "}\n" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [ "review_rows = []\n", "for _, row in df_books.iterrows():\n", " title = row['title']\n", " sentiment_label = row['sentiment_label']\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", " sampled_reviews = random.sample(review_pool, 10)\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row['rating'],\n", " \"popularity_score\": row['popularity_score']\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)\n" ] }, { "cell_type": "markdown", "source": [ "### *c. inputs for R*" ], "metadata": { "id": "_602pYUS3gY5" } }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3946e521", "outputId": "0f60f9ef-4d44-4c1e-f0cc-6d27ef94d420" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "โœ… Wrote synthetic_title_level_features.csv\n", "โœ… Wrote synthetic_monthly_revenue_series.csv\n" ] } ], "source": [ "import numpy as np\n", "\n", "def _safe_num(s):\n", " return pd.to_numeric(\n", " pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n", " errors=\"coerce\"\n", " )\n", "\n", "# --- Clean book metadata (price/rating) ---\n", "df_books_r = df_books.copy()\n", "if \"price\" in df_books_r.columns:\n", " df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n", "if \"rating\" in df_books_r.columns:\n", " df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n", "\n", "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n", "\n", "# --- Clean sales ---\n", "df_sales_r = df_sales.copy()\n", "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n", "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n", "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n", "\n", "# --- Clean reviews ---\n", "df_reviews_r = df_reviews.copy()\n", "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n", "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n", "if \"rating\" in df_reviews_r.columns:\n", " df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n", "if \"popularity_score\" in df_reviews_r.columns:\n", " df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n", "\n", "# --- Sentiment shares per title (from reviews) ---\n", "sent_counts = (\n", " df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n", " .size()\n", " .unstack(fill_value=0)\n", ")\n", "for lab in [\"positive\", \"neutral\", \"negative\"]:\n", " if lab not in sent_counts.columns:\n", " sent_counts[lab] = 0\n", "\n", "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n", "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n", "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n", "sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n", "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n", "sent_counts = sent_counts.reset_index()\n", "\n", "# --- Sales aggregation per title ---\n", "sales_by_title = (\n", " df_sales_r.dropna(subset=[\"title\"])\n", " .groupby(\"title\", as_index=False)\n", " .agg(\n", " months_observed=(\"month\", \"nunique\"),\n", " avg_units_sold=(\"units_sold\", \"mean\"),\n", " total_units_sold=(\"units_sold\", \"sum\"),\n", " )\n", ")\n", "\n", "# --- Title-level features (join sales + books + sentiment) ---\n", "df_title = (\n", " sales_by_title\n", " .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n", " .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n", " on=\"title\", how=\"left\")\n", ")\n", "\n", "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n", "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n", "\n", "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n", "print(\"โœ… Wrote synthetic_title_level_features.csv\")\n", "\n", "# --- Monthly revenue series (proxy: units_sold * price) ---\n", "monthly_rev = (\n", " df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n", ")\n", "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n", "\n", "df_monthly = (\n", " monthly_rev.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"revenue\"]\n", " .sum()\n", " .rename(columns={\"revenue\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", ")\n", "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n", "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n", " df_monthly = (\n", " df_sales_r.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"units_sold\"]\n", " .sum()\n", " .rename(columns={\"units_sold\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", " )\n", "\n", "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n", "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n", "print(\"โœ… Wrote synthetic_monthly_revenue_series.csv\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. โœ‹๐Ÿป๐Ÿ›‘โ›”๏ธ View the first few lines*" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "xfE8NMqOurKo", "outputId": "fef5de5e-d0d7-43c4-e491-a7dd00b8c5a6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label \\\n", "0 A Light in the Attic neutral \n", "1 A Light in the Attic neutral \n", "2 A Light in the Attic neutral \n", "3 A Light in the Attic neutral \n", "4 A Light in the Attic neutral \n", "\n", " review_text rating popularity_score \n", "0 The book maintained a steady rhythm. Three 3 \n", "1 The dialogue was functional. Three 3 \n", "2 The structure was conventional. Three 3 \n", "3 The writing style was straightforward. Three 3 \n", "4 The characters were average. Three 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralThe book maintained a steady rhythm.Three3
1A Light in the AtticneutralThe dialogue was functional.Three3
2A Light in the AtticneutralThe structure was conventional.Three3
3A Light in the AtticneutralThe writing style was straightforward.Three3
4A Light in the AtticneutralThe characters were average.Three3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"I highly recommend this book.\",\n \"I couldn\\u2019t connect with the characters.\",\n \"An outstanding contribution to the genre.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 27 } ], "source": [ "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", "EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }