{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# đŸ€– **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** 📩 Install required packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "4d0d052d-dec9-41c3-884d-dd95526f964a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 pages\n", "for page in range(1, 51):\n", " url = base_url.format(page)\n", " response = requests.get(url, headers=headers)\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", " for book in books:\n", " titles.append(book.h3.a[\"title\"])\n", " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", " ratings.append(book.p.get(\"class\")[1])\n", "\n", " time.sleep(0.5) # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c. âœ‹đŸ»đŸ›‘â›”ïž Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "azPR5NjHqphK" }, "outputs": [], "source": [ "df_books = pd.DataFrame({\n", " \"title\": titles,\n", " \"price\": prices,\n", " \"rating\": ratings\n", "})" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# đŸ’Ÿ Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# đŸ’Ÿ Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. âœ‹đŸ»đŸ›‘â›”ïž View first fiew lines*" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "O_wIvTxYZqCK", "outputId": "de83f95f-f4ff-462b-af6a-7de5f0b5f339" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating\n", "0 A Light in the Attic 51.77 Three\n", "1 Tipping the Velvet 53.74 One\n", "2 Soumission 50.10 One\n", "3 Sharp Objects 47.82 Four\n", "4 Sapiens: A Brief History of Humankind 54.23 Five" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepricerating
0A Light in the Attic51.77Three
1Tipping the Velvet53.74One
2Soumission50.10One
3Sharp Objects47.82Four
4Sapiens: A Brief History of Humankind54.23Five
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 6 } ], "source": [ "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** đŸ§© Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n", " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n", " return int(np.clip(base + trend_factor, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. âœ‹đŸ»đŸ›‘â›”ïž Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "zaTMGv0NrqAP" }, "outputs": [], "source": [ "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", " if popularity_score <= 2:\n", " return \"negative\"\n", " elif popularity_score == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e. âœ‹đŸ»đŸ›‘â›”ïž Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "_jZG8Hljrx9L" }, "outputs": [], "source": [ "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** 📈 Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment):\n", " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n", "\n", " if sentiment == \"positive\":\n", " base = random.randint(200, 300)\n", " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", " elif sentiment == \"negative\":\n", " base = random.randint(20, 80)\n", " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", " else: # neutral\n", " base = random.randint(80, 160)\n", " trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", " noise = np.random.normal(0, 5, len(months))\n", " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", " records = generate_sales_profile(row[\"sentiment_label\"])\n", " for month, units in records:\n", " sales_data.append({\n", " \"title\": row[\"title\"],\n", " \"month\": month,\n", " \"units_sold\": units,\n", " \"sentiment_label\": row[\"sentiment_label\"]\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c. âœ‹đŸ»đŸ›‘â›”ïž Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "NW-AjejTsIPG" }, "outputs": [], "source": [ "df_sales = pd.DataFrame(sales_data)" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MzbZvLcAhGaH", "outputId": "82feb55d-4977-429c-be89-2b44dd2439a0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 100 neutral\n", "1 A Light in the Attic 2024-10 109 neutral\n", "2 A Light in the Attic 2024-11 102 neutral\n", "3 A Light in the Attic 2024-12 107 neutral\n", "4 A Light in the Attic 2025-01 108 neutral\n" ] } ], "source": [ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "print(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** 🎯 Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. âœ‹đŸ»đŸ›‘â›”ïž Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "b3cd2a50" }, "outputs": [], "source": [ "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"A beautifully written story that pulled me in from page one.\",\n", " \"I couldn't put this down—fast-paced, emotional, and satisfying.\",\n", " \"The characters felt so real, I miss them already.\",\n", " \"A smart, heartfelt read with a payoff that truly lands.\",\n", " \"Gorgeous prose and a plot that kept surprising me.\",\n", " \"This book restored my faith in storytelling—fantastic.\",\n", " \"An uplifting, immersive journey that stayed with me for days.\",\n", " \"The world-building is vivid without ever slowing the story.\",\n", " \"A tender, thoughtful novel with moments that made me tear up.\",\n", " \"Every chapter made me want to keep going—excellent pacing.\",\n", " \"A rare book where the ending feels earned and unforgettable.\",\n", " \"I loved the dialogue—sharp, funny, and full of personality.\",\n", " \"A compelling blend of mystery and emotion—highly recommend.\",\n", " \"The themes were handled with care and real insight.\",\n", " \"An instant favorite—I'll be recommending this to everyone.\",\n", " \"The plot twists were clever without feeling cheap.\",\n", " \"A powerful story told with confidence and heart.\",\n", " \"I laughed, I cried, and I immediately wanted to reread it.\",\n", " \"Strong voice, strong characters, and a wonderfully satisfying arc.\",\n", " \"A gripping read with surprising depth and warmth.\",\n", " \"This was exactly what I needed—comforting and inspiring.\",\n", " \"The author’s writing style is addictive and beautifully precise.\",\n", " \"A moving exploration of love, loss, and hope.\",\n", " \"I was completely transported—this is storytelling at its best.\",\n", " \"A delightful surprise—fresh, original, and genuinely engaging.\",\n", " \"The main character’s growth felt authentic and rewarding.\",\n", " \"Such an atmospheric book—I could picture every scene.\",\n", " \"A wonderful mix of humor and heart that never felt forced.\",\n", " \"I devoured it in two sittings—so entertaining.\",\n", " \"An emotionally rich novel that still keeps the plot tight.\",\n", " \"Fantastic character chemistry and a plot that kept me hooked.\",\n", " \"The writing is elegant, and the story is even better.\",\n", " \"A masterpiece of pacing—never a dull moment.\",\n", " \"This book has one of the best openings I've read in ages.\",\n", " \"It’s rare to find a story this both fun and meaningful.\",\n", " \"The ending made me smile—so satisfying and well done.\",\n", " \"A thrilling ride with real emotional stakes.\",\n", " \"Beautifully layered storytelling with memorable scenes.\",\n", " \"An engaging plot and a voice that feels fresh and confident.\",\n", " \"The side characters were just as strong as the lead—loved it.\",\n", " \"I appreciated how thoughtful and nuanced the themes were.\",\n", " \"A heartfelt page-turner that’s hard to forget.\",\n", " \"Clever, charming, and surprisingly profound.\",\n", " \"A stunning debut—can’t wait to read more from this author.\",\n", " \"The suspense was perfect and the resolution was rewarding.\",\n", " \"I enjoyed every moment—pure reading joy.\",\n", " \"A captivating story with writing that sparkles.\",\n", " \"The emotional beats hit perfectly—so well crafted.\",\n", " \"This book exceeded my expectations in every way.\",\n", " \"An unforgettable read—highly recommended.\"\n", " ],\n", " \"neutral\": [\n", " \"It was an okay read—some strong parts, some slower stretches.\",\n", " \"Interesting premise, but the execution didn’t fully wow me.\",\n", " \"I enjoyed it enough, though it didn’t leave a big impression.\",\n", " \"A solid book overall, just not quite my favorite.\",\n", " \"The writing was good, but the plot felt a bit predictable.\",\n", " \"Some chapters were engaging, others dragged for me.\",\n", " \"Not bad, not amazing—just a decent way to spend an afternoon.\",\n", " \"The characters were fine, though I wanted more depth.\",\n", " \"I liked the idea more than the story itself.\",\n", " \"A competently written book that didn’t take many risks.\",\n", " \"It held my attention, but I wasn’t eager to rush back to it.\",\n", " \"The pacing was uneven—strong start, slower middle.\",\n", " \"The ending was fine, though I expected something bigger.\",\n", " \"There were a few great scenes, but also some filler.\",\n", " \"I can see why people like it, but it wasn’t for me.\",\n", " \"The story was straightforward and easy to follow.\",\n", " \"Good writing, but the tone didn’t always work for me.\",\n", " \"It had moments of charm, but didn’t fully click overall.\",\n", " \"A pleasant read, though I probably won’t reread it.\",\n", " \"The plot was okay, but I wanted more tension.\",\n", " \"Some characters stood out, while others felt flat.\",\n", " \"It started strong, then became more average as it went on.\",\n", " \"The world-building was interesting but not fully explored.\",\n", " \"A decent story with a few memorable moments.\",\n", " \"I finished it, but I wasn’t blown away.\",\n", " \"It delivered what it promised—nothing more, nothing less.\",\n", " \"The writing style was fine, though not particularly distinctive.\",\n", " \"The middle section slowed down the momentum a lot.\",\n", " \"A good concept, but the ending felt a little rushed.\",\n", " \"It was readable, but I didn’t connect emotionally.\",\n", " \"Some of the dialogue felt natural; some felt stiff.\",\n", " \"I liked parts of it, but overall it was just okay.\",\n", " \"A middle-of-the-road read with a few highlights.\",\n", " \"It had potential, but didn’t fully deliver for me.\",\n", " \"There’s nothing wrong with it—just not memorable.\",\n", " \"The story was fine, but I wanted more character development.\",\n", " \"An easy read, though the stakes felt low.\",\n", " \"I didn’t dislike it, but I wasn’t excited either.\",\n", " \"The plot moved along, but it rarely surprised me.\",\n", " \"It had a clear message, though it felt a bit on-the-nose.\",\n", " \"Some chapters were compelling, others felt repetitive.\",\n", " \"I liked the setting more than the characters.\",\n", " \"A decent book that I’d recommend only to certain readers.\",\n", " \"I’m glad I read it, but I won’t be talking about it much.\",\n", " \"It was entertaining enough, though not standout.\",\n", " \"The writing was smooth, but the story felt familiar.\",\n", " \"It had a few good twists, but I guessed most of them.\",\n", " \"Not a waste of time—just not a must-read either.\",\n", " \"A fine addition to the genre, but not exceptional.\",\n", " \"Overall: solid, but not remarkable.\"\n", " ],\n", " \"negative\": [\n", " \"I struggled to get into this—nothing really grabbed me.\",\n", " \"The pacing was slow and the story felt stretched out.\",\n", " \"I wanted to like it, but the characters felt flat.\",\n", " \"The plot was confusing and hard to stay invested in.\",\n", " \"It felt predictable, and the twists didn’t land for me.\",\n", " \"The writing style just didn’t work—I kept losing focus.\",\n", " \"I finished it, but only out of determination.\",\n", " \"Too many clichĂ©s and not enough originality.\",\n", " \"The dialogue felt unnatural and pulled me out of the story.\",\n", " \"The middle dragged so much that I almost DNF’d it.\",\n", " \"I found the main character difficult to root for.\",\n", " \"The story had potential, but the execution was disappointing.\",\n", " \"The tone was inconsistent and made it hard to enjoy.\",\n", " \"The ending felt rushed and unsatisfying.\",\n", " \"A lot happens, but none of it felt meaningful.\",\n", " \"The plot holes made it hard to take seriously.\",\n", " \"I didn’t connect with the characters at all.\",\n", " \"It was repetitive and could have been much shorter.\",\n", " \"The writing felt messy and unclear in places.\",\n", " \"The book didn’t live up to the hype for me.\",\n", " \"I kept waiting for it to get better, but it never did.\",\n", " \"The twists felt forced rather than clever.\",\n", " \"The story was dull and lacked emotional impact.\",\n", " \"Some scenes were unnecessarily long and added nothing.\",\n", " \"It felt like the author didn’t know how to end it.\",\n", " \"The characterization was shallow and inconsistent.\",\n", " \"I found myself skimming just to get through it.\",\n", " \"The plot wandered without a clear direction.\",\n", " \"It tried to be deep, but it came off as preachy.\",\n", " \"The conflict felt artificial and unconvincing.\",\n", " \"The writing was awkward and distracting.\",\n", " \"I couldn’t suspend disbelief—too many unrealistic moments.\",\n", " \"The world-building was thin and didn’t make sense.\",\n", " \"The stakes never felt real, so I didn’t care what happened.\",\n", " \"The emotional moments felt unearned.\",\n", " \"The book was more frustrating than enjoyable.\",\n", " \"The characters made decisions that didn’t make sense.\",\n", " \"The story relied too much on convenient coincidences.\",\n", " \"I didn’t like the pacing—slow start and abrupt finish.\",\n", " \"It felt like a first draft that needed heavy editing.\",\n", " \"The plot was bland and offered nothing new.\",\n", " \"I was bored for most of it, despite the premise.\",\n", " \"The writing felt repetitive and overly descriptive.\",\n", " \"The ending left too many loose threads in a bad way.\",\n", " \"It was hard to follow and not rewarding to read.\",\n", " \"The book promised a lot but delivered very little.\",\n", " \"I regret spending time on this one.\",\n", " \"This just wasn’t enjoyable—I'd skip it.\",\n", " \"I couldn’t get invested, and it felt like a chore.\",\n", " \"Disappointing overall—would not recommend.\"\n", " ]\n", "}" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [ "review_rows = []\n", "for _, row in df_books.iterrows():\n", " title = row['title']\n", " sentiment_label = row['sentiment_label']\n", " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n", " sampled_reviews = random.sample(review_pool, 10)\n", " for review_text in sampled_reviews:\n", " review_rows.append({\n", " \"title\": title,\n", " \"sentiment_label\": sentiment_label,\n", " \"review_text\": review_text,\n", " \"rating\": row['rating'],\n", " \"popularity_score\": row['popularity_score']\n", " })" ] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" ] }, { "cell_type": "markdown", "source": [ "### *c. inputs for R*" ], "metadata": { "id": "_602pYUS3gY5" } }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3946e521", "outputId": "4f3c942b-cfd3-4259-fedb-3dc75e4008e9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✅ Wrote synthetic_title_level_features.csv\n", "✅ Wrote synthetic_monthly_revenue_series.csv\n" ] } ], "source": [ "import numpy as np\n", "\n", "def _safe_num(s):\n", " return pd.to_numeric(\n", " pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n", " errors=\"coerce\"\n", " )\n", "\n", "# --- Clean book metadata (price/rating) ---\n", "df_books_r = df_books.copy()\n", "if \"price\" in df_books_r.columns:\n", " df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n", "if \"rating\" in df_books_r.columns:\n", " df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n", "\n", "df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n", "\n", "# --- Clean sales ---\n", "df_sales_r = df_sales.copy()\n", "df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n", "df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n", "df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n", "\n", "# --- Clean reviews ---\n", "df_reviews_r = df_reviews.copy()\n", "df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n", "df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n", "if \"rating\" in df_reviews_r.columns:\n", " df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n", "if \"popularity_score\" in df_reviews_r.columns:\n", " df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n", "\n", "# --- Sentiment shares per title (from reviews) ---\n", "sent_counts = (\n", " df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n", " .size()\n", " .unstack(fill_value=0)\n", ")\n", "for lab in [\"positive\", \"neutral\", \"negative\"]:\n", " if lab not in sent_counts.columns:\n", " sent_counts[lab] = 0\n", "\n", "sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n", "den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n", "sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n", "sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n", "sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n", "sent_counts = sent_counts.reset_index()\n", "\n", "# --- Sales aggregation per title ---\n", "sales_by_title = (\n", " df_sales_r.dropna(subset=[\"title\"])\n", " .groupby(\"title\", as_index=False)\n", " .agg(\n", " months_observed=(\"month\", \"nunique\"),\n", " avg_units_sold=(\"units_sold\", \"mean\"),\n", " total_units_sold=(\"units_sold\", \"sum\"),\n", " )\n", ")\n", "\n", "# --- Title-level features (join sales + books + sentiment) ---\n", "df_title = (\n", " sales_by_title\n", " .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n", " .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n", " on=\"title\", how=\"left\")\n", ")\n", "\n", "df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n", "df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n", "\n", "df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n", "print(\"✅ Wrote synthetic_title_level_features.csv\")\n", "\n", "# --- Monthly revenue series (proxy: units_sold * price) ---\n", "monthly_rev = (\n", " df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n", ")\n", "monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n", "\n", "df_monthly = (\n", " monthly_rev.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"revenue\"]\n", " .sum()\n", " .rename(columns={\"revenue\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", ")\n", "# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n", "if df_monthly[\"total_revenue\"].notna().sum() == 0:\n", " df_monthly = (\n", " df_sales_r.dropna(subset=[\"month\"])\n", " .groupby(\"month\", as_index=False)[\"units_sold\"]\n", " .sum()\n", " .rename(columns={\"units_sold\": \"total_revenue\"})\n", " .sort_values(\"month\")\n", " )\n", "\n", "df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n", "df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n", "print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. âœ‹đŸ»đŸ›‘â›”ïž View the first few lines*" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "xfE8NMqOurKo", "outputId": "feac0935-973e-40ee-8b14-e0b9870fd84d" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label \\\n", "0 A Light in the Attic neutral \n", "1 A Light in the Attic neutral \n", "2 A Light in the Attic neutral \n", "3 A Light in the Attic neutral \n", "4 A Light in the Attic neutral \n", "\n", " review_text rating popularity_score \n", "0 It had a few good twists, but I guessed most o... Three 3 \n", "1 The world-building was interesting but not ful... Three 3 \n", "2 There’s nothing wrong with it—just not memorable. Three 3 \n", "3 I liked the idea more than the story itself. Three 3 \n", "4 It was entertaining enough, though not standout. Three 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralIt had a few good twists, but I guessed most o...Three3
1A Light in the AtticneutralThe world-building was interesting but not ful...Three3
2A Light in the AtticneutralThere’s nothing wrong with it—just not memorable.Three3
3A Light in the AtticneutralI liked the idea more than the story itself.Three3
4A Light in the AtticneutralIt was entertaining enough, though not standout.Three3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"This was exactly what I needed\\u2014comforting and inspiring.\",\n \"A lot happens, but none of it felt meaningful.\",\n \"Fantastic character chemistry and a plot that kept me hooked.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 21 } ], "source": [ "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", "EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }