{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "4ba6aba8" }, "source": [ "# šŸ¤– **Data Collection, Creation, Storage, and Processing**\n" ] }, { "cell_type": "markdown", "metadata": { "id": "jpASMyIQMaAq" }, "source": [ "## **1.** šŸ“¦ Install required packages" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f48c8f8c", "outputId": "c81f2626-4c46-40a1-f36d-9653d42ae4a2" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", "Requirement already 
satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" ] } ], "source": [ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob" ] }, { "cell_type": "markdown", "metadata": { "id": "lquNYCbfL9IM" }, "source": [ "## **2.** ā› Web-scrape all book titles, prices, and ratings from books.toscrape.com" ] }, { "cell_type": "markdown", "metadata": { "id": "0IWuNpxxYDJF" }, "source": [ "### *a. 
Initial setup*\n", "Define the base url of the website you will scrape as well as how and what you will scrape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "91d52125" }, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "import time\n", "\n", "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n", "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", "\n", "titles, prices, ratings = [], [], []" ] }, { "cell_type": "markdown", "metadata": { "id": "oCdTsin2Yfp3" }, "source": [ "### *b. Fill titles, prices, and ratings from the web pages*" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "xqO5Y3dnYhxt" }, "outputs": [], "source": [ "# Loop through all 50 catalogue pages.\n", "# Start from empty lists: the accumulators are otherwise created only in the setup\n", "# cell, so re-running this cell alone would append every book a second time.\n", "titles, prices, ratings = [], [], []\n", "\n", "for page in range(1, 51):\n", "    url = base_url.format(page)\n", "    # timeout keeps a stalled connection from hanging the notebook indefinitely\n", "    response = requests.get(url, headers=headers, timeout=30)\n", "    response.raise_for_status()  # stop on an HTTP error instead of parsing an error page\n", "    soup = BeautifulSoup(response.content, \"html.parser\")\n", "    books = soup.find_all(\"article\", class_=\"product_pod\")\n", "\n", "    for book in books:\n", "        titles.append(book.h3.a[\"title\"])\n", "        # drop the first character of the price text (presumably the currency symbol)\n", "        prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n", "        # the second CSS class of the article's first <p> is stored as the rating (e.g. \"Three\")\n", "        ratings.append(book.p.get(\"class\")[1])\n", "\n", "    time.sleep(0.5)  # polite scraping delay" ] }, { "cell_type": "markdown", "metadata": { "id": "T0TOeRC4Yrnn" }, "source": [ "### *c.
āœ‹šŸ»šŸ›‘ā›”ļø Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "l5FkkNhUYTHh", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "4b62070a-611b-4850-d0e6-8e5aa8590c34" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title price rating rating_num\n", "0 A Light in the Attic 51.77 Three 3\n", "1 Tipping the Velvet 53.74 One 1\n", "2 Soumission 50.10 One 1\n", "3 Sharp Objects 47.82 Four 4\n", "4 Sapiens: A Brief History of Humankind 54.23 Five 5" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingrating_num
0A Light in the Attic51.77Three3
1Tipping the Velvet53.74One1
2Soumission50.10One1
3Sharp Objects47.82Four4
4Sapiens: A Brief History of Humankind54.23Five5
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 2000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.443075738771789,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_num\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 1,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 8 } ], "source": [ "df_books = pd.DataFrame({\n", " \"title\": titles,\n", " \"price\": prices,\n", " \"rating\": ratings\n", "})\n", "\n", "# Optional: convert rating words to numbers\n", "rating_map = {\"One\": 1, \"Two\": 2, \"Three\": 3, \"Four\": 4, \"Five\": 5}\n", "df_books[\"rating_num\"] = df_books[\"rating\"].map(rating_map)\n", "\n", "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "duI5dv3CZYvF" }, "source": [ "### *d. 
Save web-scraped dataframe either as a CSV or Excel file*" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "lC1U_YHtZifh" }, "outputs": [], "source": [ "# 💾 Save to CSV\n", "df_books.to_csv(\"books_data.csv\", index=False)\n", "\n", "# 💾 Or save to Excel\n", "# df_books.to_excel(\"books_data.xlsx\", index=False)" ] }, { "cell_type": "markdown", "metadata": { "id": "qMjRKMBQZlJi" }, "source": [ "### *e. ✋🏻🛑⛔️ View the first few lines*" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "O_wIvTxYZqCK", "outputId": "50aac303-450b-4a5b-eb8a-226f6a3ea0fe" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "                                   title  price rating  rating_num\n", "0                   A Light in the Attic  51.77  Three           3\n", "1                     Tipping the Velvet  53.74    One           1\n", "2                             Soumission  50.10    One           1\n", "3                          Sharp Objects  47.82   Four           4\n", "4  Sapiens: A Brief History of Humankind  54.23   Five           5" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlepriceratingrating_num
0A Light in the Attic51.77Three3
1Tipping the Velvet53.74One1
2Soumission50.10One1
3Sharp Objects47.82Four4
4Sapiens: A Brief History of Humankind54.23Five5
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_books", "summary": "{\n \"name\": \"df_books\",\n \"rows\": 2000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.443075738771789,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating_num\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 1,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ], "source": [ "df_books.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "p-1Pr2szaqLk" }, "source": [ "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "SIaJUGIpaH4V" }, "source": [ "### *a. Initial setup*" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "-gPXGcRPuV_9" }, "outputs": [], "source": [ "import numpy as np\n", "import random\n", "from datetime import datetime\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(2025)\n", "np.random.seed(2025)" ] }, { "cell_type": "markdown", "metadata": { "id": "pY4yCoIuaQqp" }, "source": [ "### *b. 
Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "mnd5hdAbaNjz" }, "outputs": [], "source": [ "def generate_popularity_score(rating):\n", "    \"\"\"Turn a rating word into a randomised popularity score between 1 and 5.\n", "\n", "    The rating selects a baseline (3 for unrecognised values), a weighted random\n", "    shift of -1, 0 or +1 is added, and the sum is clipped to the 1-5 range.\n", "    \"\"\"\n", "    baseline_by_rating = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}\n", "    baseline = baseline_by_rating.get(rating, 3)\n", "    # One weighted draw: 0 is most likely, then +1, then -1.\n", "    (shift,) = random.choices([-1, 0, 1], weights=[1, 3, 2])\n", "    return int(np.clip(baseline + shift, 1, 5))" ] }, { "cell_type": "markdown", "metadata": { "id": "n4-TaNTFgPak" }, "source": [ "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "V-G3OCUCgR07", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "a6c6a110-2f93-4906-903d-950392522db5" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "                                   title rating  popularity_score\n", "0                   A Light in the Attic  Three                 3\n", "1                     Tipping the Velvet    One                 2\n", "2                             Soumission    One                 2\n", "3                          Sharp Objects   Four                 4\n", "4  Sapiens: A Brief History of Humankind   Five                 3" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleratingpopularity_score
0A Light in the AtticThree3
1Tipping the VelvetOne2
2SoumissionOne2
3Sharp ObjectsFour4
4Sapiens: A Brief History of HumankindFive3
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df_books[[\\\"title\\\", \\\"rating\\\", \\\"popularity_score\\\"]]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 14 } ], "source": [ "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n", "\n", "df_books[[\"title\", \"rating\", \"popularity_score\"]].head()" ] }, { "cell_type": "markdown", "metadata": { "id": "HnngRNTgacYt" }, "source": [ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "kUtWmr8maZLZ" }, "outputs": [], "source": [ "def get_sentiment(popularity_score):\n", "    \"\"\"Translate a popularity score into a sentiment label.\n", "\n", "    A score of exactly 3 is \"neutral\", a score of 2 or less is \"negative\",\n", "    and every other value is \"positive\".\n", "    \"\"\"\n", "    if popularity_score == 3:\n", "        return \"neutral\"\n", "    return \"negative\" if popularity_score <= 2 else \"positive\"" ] }, { "cell_type": "markdown", "metadata": { "id": "HF9F9HIzgT7Z" }, "source": [ "### *e.
āœ‹šŸ»šŸ›‘ā›”ļø Run the function to create a \"sentiment_label\" column from \"popularity_score\"*" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "tafQj8_7gYCG", "colab": { "base_uri": "https://localhost:8080/", "height": 224 }, "outputId": "a5e4e6b1-1cc0-4ee9-f286-b1f5975412da" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title rating popularity_score \\\n", "0 A Light in the Attic Three 3 \n", "1 Tipping the Velvet One 2 \n", "2 Soumission One 2 \n", "3 Sharp Objects Four 4 \n", "4 Sapiens: A Brief History of Humankind Five 3 \n", "\n", " sentiment_label \n", "0 neutral \n", "1 negative \n", "2 negative \n", "3 positive \n", "4 neutral " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleratingpopularity_scoresentiment_label
0A Light in the AtticThree3neutral
1Tipping the VelvetOne2negative
2SoumissionOne2negative
3Sharp ObjectsFour4positive
4Sapiens: A Brief History of HumankindFive3neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df_books[[\\\"title\\\", \\\"rating\\\", \\\"popularity_score\\\", \\\"sentiment_label\\\"]]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ], "source": [ "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n", "\n", "df_books[[\"title\", \"rating\", \"popularity_score\", \"sentiment_label\"]].head()" ] }, { "cell_type": "markdown", "metadata": { "id": "T8AdKkmASq9a" }, "source": [ "## **4.** šŸ“ˆ Generate synthetic book sales data of 18 months" ] }, { "cell_type": "markdown", "metadata": { "id": "OhXbdGD5fH0c" }, "source": [ "### *a. 
Create a generate_sales_profile function that would generate sales patterns based on sentiment_label (with some randomness)*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "qkVhYPXGbgEn" }, "outputs": [], "source": [ "def generate_sales_profile(sentiment, n_months=18, end=None):\n", "    \"\"\"Simulate a monthly unit-sales history for one book.\n", "\n", "    Args:\n", "        sentiment: \"positive\" gives a high, rising baseline and \"negative\" a low,\n", "            falling one; any other value is treated as neutral (flat baseline).\n", "        n_months: Number of month-end periods to generate (default 18).\n", "        end: Latest date the range may reach; defaults to datetime.today().\n", "            Pass a fixed date to make the month labels reproducible.\n", "\n", "    Returns:\n", "        A list of (month, units) tuples, where month is a \"YYYY-MM\" string and\n", "        units is a non-negative NumPy integer.\n", "    \"\"\"\n", "    if end is None:\n", "        end = datetime.today()\n", "    # An explicit MonthEnd offset yields the same month-end dates as the \"M\" alias,\n", "    # which pandas 2.2 deprecated in favour of \"ME\", and works on older versions too.\n", "    months = pd.date_range(end=end, periods=n_months, freq=pd.offsets.MonthEnd())\n", "\n", "    if sentiment == \"positive\":\n", "        base = random.randint(200, 300)\n", "        trend = np.linspace(base, base + random.randint(20, 60), len(months))\n", "    elif sentiment == \"negative\":\n", "        base = random.randint(20, 80)\n", "        trend = np.linspace(base, base - random.randint(10, 30), len(months))\n", "    else:  # neutral\n", "        base = random.randint(80, 160)\n", "        trend = np.full(len(months), base + random.randint(-10, 10))\n", "\n", "    # Three half-periods of a sine wave across the window, amplitude 10 units.\n", "    seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n", "    noise = np.random.normal(0, 5, len(months))\n", "    # Clip at zero so noise can never produce negative sales, then truncate to ints.\n", "    monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n", "\n", "    return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))" ] }, { "cell_type": "markdown", "metadata": { "id": "L2ak1HlcgoTe" }, "source": [ "### *b. Run the function as part of building sales_data*" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "SlJ24AUafoDB" }, "outputs": [], "source": [ "sales_data = []\n", "for _, row in df_books.iterrows():\n", "    records = generate_sales_profile(row[\"sentiment_label\"])\n", "    for month, units in records:\n", "        sales_data.append({\n", "            \"title\": row[\"title\"],\n", "            \"month\": month,\n", "            \"units_sold\": units,\n", "            \"sentiment_label\": row[\"sentiment_label\"]\n", "        })" ] }, { "cell_type": "markdown", "metadata": { "id": "4IXZKcCSgxnq" }, "source": [ "### *c.
āœ‹šŸ»šŸ›‘ā›”ļø Create a df_sales DataFrame from sales_data*" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "wcN6gtiZg-ws", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "802ea916-5f82-479c-96ad-aab7e03111b1" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 122 neutral\n", "1 A Light in the Attic 2024-10 131 neutral\n", "2 A Light in the Attic 2024-11 124 neutral\n", "3 A Light in the Attic 2024-12 129 neutral\n", "4 A Light in the Attic 2025-01 130 neutral" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09122neutral
1A Light in the Attic2024-10131neutral
2A Light in the Attic2024-11124neutral
3A Light in the Attic2024-12129neutral
4A Light in the Attic2025-01130neutral
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_sales", "summary": "{\n \"name\": \"df_sales\",\n \"rows\": 36000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"2024-09\",\n \"2024-10\",\n \"2025-05\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 98,\n \"min\": 0,\n \"max\": 362,\n \"num_unique_values\": 360,\n \"samples\": [\n 223,\n 300,\n 173\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ], "source": [ "df_sales = pd.DataFrame(sales_data)\n", "df_sales.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "EhIjz9WohAmZ" }, "source": [ "### *d. 
Save df_sales as synthetic_sales_data.csv & view first few lines*" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MzbZvLcAhGaH", "outputId": "f548a779-4d26-4f99-e8a3-ea918e2a2a64" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " title month units_sold sentiment_label\n", "0 A Light in the Attic 2024-09 122 neutral\n", "1 A Light in the Attic 2024-10 131 neutral\n", "2 A Light in the Attic 2024-11 124 neutral\n", "3 A Light in the Attic 2024-12 129 neutral\n", "4 A Light in the Attic 2025-01 130 neutral\n" ] } ], "source": [ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n", "\n", "print(df_sales.head())" ] }, { "cell_type": "markdown", "metadata": { "id": "7g9gqBgQMtJn" }, "source": [ "## **5.** šŸŽÆ Generate synthetic customer reviews" ] }, { "cell_type": "markdown", "metadata": { "id": "Gi4y9M9KuDWx" }, "source": [ "### *a. āœ‹šŸ»šŸ›‘ā›”ļø Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "b3cd2a50", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "b9be8a1f-b38a-40f7-dc8c-4d192be39f62" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{'positive': 50, 'neutral': 50, 'negative': 50}\n" ] } ], "source": [ "import random\n", "\n", "synthetic_reviews_by_sentiment = {\n", " \"positive\": [\n", " \"A delightful read—engaging from the first chapter to the last.\",\n", " \"Beautifully written with characters that felt real.\",\n", " \"An uplifting story with a satisfying ending.\",\n", " \"I couldn’t put it down; the pacing was fantastic.\",\n", " \"Heartfelt and memorable—this one will stick with me.\",\n", " \"The plot was gripping and the twists were well-earned.\",\n", " \"Strong storytelling and vivid descriptions 
throughout.\",\n", " \"A wonderfully immersive world and a great cast of characters.\",\n", " \"Surprisingly emotional in the best way.\",\n", " \"A smart, charming book that exceeded my expectations.\",\n", " \"The writing style was polished and easy to sink into.\",\n", " \"A powerful message delivered with care and nuance.\",\n", " \"One of those books you want to recommend to everyone.\",\n", " \"The character development was excellent and believable.\",\n", " \"I loved the atmosphere—rich, detailed, and consistent.\",\n", " \"Clever dialogue and a plot that kept me invested.\",\n", " \"A refreshing take on familiar themes.\",\n", " \"A satisfying, well-structured story with great momentum.\",\n", " \"An inspiring read that left me feeling hopeful.\",\n", " \"Highly enjoyable—great balance of action and emotion.\",\n", " \"The author’s voice is confident and captivating.\",\n", " \"A standout book with an ending that landed perfectly.\",\n", " \"Warm, witty, and genuinely fun to read.\",\n", " \"A page-turner with thoughtful themes underneath.\",\n", " \"Excellent pacing and a strong sense of place.\",\n", " \"This book made me laugh and tear up—rare combo.\",\n", " \"A creative plot with a very rewarding payoff.\",\n", " \"It was charming, heartfelt, and beautifully paced.\",\n", " \"The story hooked me early and never let go.\",\n", " \"A smart, satisfying read with great character arcs.\",\n", " \"Fantastic world-building and a compelling conflict.\",\n", " \"I enjoyed every chapter—consistently strong writing.\",\n", " \"A confident debut (or strong entry) with real emotional depth.\",\n", " \"The author nailed the tone—moving and uplifting.\",\n", " \"A thrilling ride with a surprisingly tender core.\",\n", " \"The characters’ choices felt meaningful and well-motivated.\",\n", " \"A wonderfully crafted story that feels complete.\",\n", " \"A fun, fast read that still had real substance.\",\n", " \"Strong themes, great pacing, and memorable moments.\",\n", 
" \"A cozy, satisfying read that I’ll revisit.\",\n", " \"The ending was satisfying and fit the story well.\",\n", " \"It balanced humor and seriousness in a great way.\",\n", " \"A thoughtful and engaging book with heart.\",\n", " \"A compelling story with beautiful prose.\",\n", " \"A top-tier read—well worth the time.\",\n", " \"The narrative voice was strong and consistent.\",\n", " \"A moving story that felt honest and earned.\",\n", " \"An entertaining plot with surprisingly deep characters.\",\n", " \"A strong recommendation if you like character-driven stories.\",\n", " \"This book delivered exactly what I hoped for—and more.\"\n", " ],\n", " \"neutral\": [\n", " \"It was a decent read, though not especially memorable.\",\n", " \"Some parts worked well, but others dragged a bit.\",\n", " \"An okay story overall—fine for a casual read.\",\n", " \"The premise was interesting, but the execution was uneven.\",\n", " \"I liked the idea more than the actual plot.\",\n", " \"The writing was solid, but it didn’t fully hook me.\",\n", " \"A mixed experience—some strong moments, some weak ones.\",\n", " \"Not bad, but I probably won’t reread it.\",\n", " \"It had potential, though it didn’t quite deliver for me.\",\n", " \"Enjoyable in places, but the pacing felt inconsistent.\",\n", " \"I felt neutral about the characters—some were good, others flat.\",\n", " \"The story was fine, but it didn’t stand out.\",\n", " \"An average book with a few interesting ideas.\",\n", " \"It started strong but lost momentum in the middle.\",\n", " \"The ending was okay, though a bit predictable.\",\n", " \"I didn’t love it or hate it—just ā€˜okay.’\",\n", " \"Some chapters were engaging; others felt like filler.\",\n", " \"The plot was straightforward and easy to follow.\",\n", " \"I appreciated the theme, but it wasn’t deeply explored.\",\n", " \"A solid effort, but not quite my style.\",\n", " \"I enjoyed the setting, but the story felt familiar.\",\n", " \"It was readable, but I 
expected more tension.\",\n", " \"The characters were serviceable, though not unforgettable.\",\n", " \"The tone was consistent, but the stakes felt low.\",\n", " \"A decent read, but I didn’t connect emotionally.\",\n", " \"It had a few great scenes, but the rest was average.\",\n", " \"The writing was clear, but the dialogue was hit-or-miss.\",\n", " \"Not as exciting as I hoped, but still okay.\",\n", " \"A moderate recommendation depending on your tastes.\",\n", " \"It was interesting, though it didn’t fully pull me in.\",\n", " \"Some good moments, but overall just fine.\",\n", " \"I liked parts of it, but it didn’t wow me.\",\n", " \"The plot made sense, but felt a bit safe.\",\n", " \"It was fine—nothing particularly wrong, nothing amazing.\",\n", " \"A middle-of-the-road book with a decent message.\",\n", " \"I finished it, but it didn’t leave a big impression.\",\n", " \"The pacing improved later, but the start was slow.\",\n", " \"The concept was good, but needed more depth.\",\n", " \"A serviceable read for the genre.\",\n", " \"I’m glad I read it, but it’s not a favorite.\",\n", " \"It had a clear structure, though it felt formulaic.\",\n", " \"The writing was competent, but not very distinctive.\",\n", " \"I liked the ending more than the beginning.\",\n", " \"A few plot points felt underdeveloped.\",\n", " \"The book was okay, but I wanted stronger characters.\",\n", " \"Readable and decent, but not something I’ll rave about.\",\n", " \"Some scenes were strong; overall it was average.\",\n", " \"The story was steady, but lacked surprises.\",\n", " \"It had charm, but didn’t fully land for me.\",\n", " \"A perfectly fine book to pass the time.\"\n", " ],\n", " \"negative\": [\n", " \"I struggled to stay interested—too slow for my taste.\",\n", " \"The plot felt thin and the pacing was uneven.\",\n", " \"I couldn’t connect with the characters at all.\",\n", " \"The writing style didn’t work for me.\",\n", " \"It started with promise but quickly fell 
flat.\",\n", " \"I found the story predictable and repetitive.\",\n", " \"The dialogue felt forced and unnatural.\",\n", " \"The characters felt underdeveloped and inconsistent.\",\n", " \"I had to push myself to finish it.\",\n", " \"The plot was confusing without being intriguing.\",\n", " \"Too many loose ends and not enough payoff.\",\n", " \"It felt like the story never really got going.\",\n", " \"The tone didn’t match the subject matter well.\",\n", " \"I expected more depth from the premise.\",\n", " \"The pacing dragged and the stakes felt low.\",\n", " \"The ending was unsatisfying and abrupt.\",\n", " \"I didn’t enjoy the main character’s choices or voice.\",\n", " \"It was hard to care about what was happening.\",\n", " \"The book relied on clichĆ©s more than I liked.\",\n", " \"A disappointing read that didn’t meet expectations.\",\n", " \"The plot points felt random rather than connected.\",\n", " \"The writing felt messy and hard to follow.\",\n", " \"It didn’t hold my attention for very long.\",\n", " \"The story felt stretched without enough content.\",\n", " \"I found it frustrating and not very rewarding.\",\n", " \"The character motivations didn’t make sense to me.\",\n", " \"Not my kind of book—too dull and unfocused.\",\n", " \"The storyline felt repetitive and lacked tension.\",\n", " \"It didn’t deliver on the setup.\",\n", " \"I was expecting a stronger conclusion.\",\n", " \"The book felt longer than it needed to be.\",\n", " \"I didn’t find the relationships believable.\",\n", " \"The prose felt flat and uninteresting.\",\n", " \"It had moments, but mostly didn’t work for me.\",\n", " \"The plot was hard to invest in.\",\n", " \"I didn’t feel any emotional impact.\",\n", " \"The twists (if any) were easy to see coming.\",\n", " \"I wouldn’t recommend this unless you’re a completionist.\",\n", " \"The conflict felt artificial and unconvincing.\",\n", " \"It lacked focus and clear direction.\",\n", " \"The characters felt like 
stereotypes.\",\n", " \"The pacing was slow and the payoff minimal.\",\n", " \"The writing didn’t flow well for me.\",\n", " \"It felt like it needed a stronger editor.\",\n", " \"The story didn’t match the summary hype.\",\n", " \"I kept waiting for it to improve, but it didn’t.\",\n", " \"I didn’t enjoy the narration or overall tone.\",\n", " \"It was disappointing given the premise.\",\n", " \"The plot didn’t make me care about the outcome.\",\n", " \"Overall, it wasn’t a satisfying reading experience.\"\n", " ]\n", "}\n", "\n", "# Quick check\n", "print({k: len(v) for k, v in synthetic_reviews_by_sentiment.items()})" ] }, { "cell_type": "markdown", "metadata": { "id": "fQhfVaDmuULT" }, "source": [ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "id": "l2SRc3PjuTGM" }, "outputs": [], "source": [
"import random\n",
"\n",
"REVIEWS_PER_BOOK = 10  # number of reviews drawn for every book\n",
"REVIEW_SEED = 42  # fixed seed so the sampled reviews are reproducible\n",
"\n",
"# Dedicated, seeded generator: re-running the notebook on the same df_books\n",
"# yields the same reviews, and the global `random` state is left untouched.\n",
"review_rng = random.Random(REVIEW_SEED)\n",
"\n",
"# Rebuilt from scratch on every execution, so re-running this cell never\n",
"# appends duplicate rows.\n",
"review_rows = []\n",
"\n",
"for _, row in df_books.iterrows():\n",
"    title = row[\"title\"]\n",
"    sentiment_label = row[\"sentiment_label\"]\n",
"    review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
"\n",
"    # Sample without replacement; sample() raises ValueError if the pool\n",
"    # holds fewer than REVIEWS_PER_BOOK reviews.\n",
"    sampled_reviews = review_rng.sample(review_pool, REVIEWS_PER_BOOK)\n",
"\n",
"    for review_text in sampled_reviews:\n",
"        review_rows.append({\n",
"            \"title\": title,\n",
"            \"sentiment_label\": sentiment_label,\n",
"            \"review_text\": review_text,\n",
"            # Book-level values are repeated on every review row (NaN if absent).\n",
"            \"popularity_score\": row.get(\"popularity_score\", np.nan),\n",
"            # Prefer rating_num and fall back to rating.\n",
"            \"rating\": row.get(\"rating_num\", row.get(\"rating\", np.nan))\n",
"        })"
] }, { "cell_type": "markdown", "metadata": { "id": "bmJMXF-Bukdm" }, "source": [ "### *c.
Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "id": "ZUKUqZsuumsp" }, "outputs": [], "source": [ "df_reviews = pd.DataFrame(review_rows)\n", "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)" ] }, { "cell_type": "markdown", "source": [ "### *c. inputs for R*" ], "metadata": { "id": "_602pYUS3gY5" } }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3946e521", "outputId": "23d9ccf1-30e9-4802-86cb-a7f180f3a17c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "āœ… Wrote synthetic_title_level_features.csv\n", "āœ… Wrote synthetic_monthly_revenue_series.csv\n" ] } ], "source": [
"import numpy as np\n",
"\n",
"def _safe_num(s):\n",
"    \"\"\"Parse price/rating-like values into numbers.\n",
"\n",
"    Every character except digits and the decimal point is removed before\n",
"    parsing, so currency symbols and thousands separators are dropped (and so\n",
"    is a minus sign). Values that still cannot be parsed become NaN.\n",
"    \"\"\"\n",
"    return pd.to_numeric(\n",
"        pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n",
"        errors=\"coerce\"\n",
"    )\n",
"\n",
"# --- Clean book metadata (price/rating) ---\n",
"df_books_r = df_books.copy()\n",
"for col in [\"price\", \"rating\"]:\n",
"    if col in df_books_r.columns:\n",
"        df_books_r[col] = _safe_num(df_books_r[col])\n",
"\n",
"df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n",
"\n",
"# One row per title for the joins below. Title is the only join key, so a title\n",
"# listed more than once in df_books would duplicate every matching sales row\n",
"# and inflate units/revenue; the first listing's price/rating is kept.\n",
"# reindex() adds an all-NaN price/rating column when df_books lacks it, so the\n",
"# column selection cannot raise KeyError.\n",
"book_lookup = (\n",
"    df_books_r\n",
"    .reindex(columns=[\"title\", \"price\", \"rating\"])\n",
"    .drop_duplicates(subset=\"title\", keep=\"first\")\n",
")\n",
"\n",
"# --- Clean sales ---\n",
"df_sales_r = df_sales.copy()\n",
"df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n",
"df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n",
"df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n",
"\n",
"# --- Clean reviews ---\n",
"df_reviews_r = df_reviews.copy()\n",
"df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n",
"df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n",
"for col in [\"rating\", \"popularity_score\"]:\n",
"    if col in df_reviews_r.columns:\n",
"        df_reviews_r[col] = _safe_num(df_reviews_r[col])\n",
"\n",
"# --- Sentiment shares per title (from reviews) ---\n",
"sent_counts = (\n",
"    df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n",
"    .size()\n",
"    .unstack(fill_value=0)\n",
")\n",
"# Guarantee all three label columns exist even if a label never occurs.\n",
"for lab in [\"positive\", \"neutral\", \"negative\"]:\n",
"    if lab not in sent_counts.columns:\n",
"        sent_counts[lab] = 0\n",
"\n",
"sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n",
"# A zero denominator becomes NaN so the shares are NaN instead of dividing by zero.\n",
"den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n",
"sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n",
"sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n",
"sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n",
"sent_counts = sent_counts.reset_index()\n",
"\n",
"# --- Sales aggregation per title ---\n",
"sales_by_title = (\n",
"    df_sales_r.dropna(subset=[\"title\"])\n",
"    .groupby(\"title\", as_index=False)\n",
"    .agg(\n",
"        months_observed=(\"month\", \"nunique\"),\n",
"        avg_units_sold=(\"units_sold\", \"mean\"),\n",
"        total_units_sold=(\"units_sold\", \"sum\"),\n",
"    )\n",
")\n",
"\n",
"# --- Title-level features (join sales + books + sentiment) ---\n",
"# validate= makes pandas raise MergeError if a join key is unexpectedly duplicated.\n",
"df_title = (\n",
"    sales_by_title\n",
"    .merge(book_lookup, on=\"title\", how=\"left\", validate=\"one_to_one\")\n",
"    .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n",
"           on=\"title\", how=\"left\", validate=\"one_to_one\")\n",
")\n",
"\n",
"df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n",
"df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n",
"\n",
"df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n",
"print(\"āœ… Wrote synthetic_title_level_features.csv\")\n",
"\n",
"# --- Monthly revenue series (proxy: units_sold * price) ---\n",
"monthly_rev = df_sales_r.merge(\n",
"    book_lookup[[\"title\", \"price\"]], on=\"title\", how=\"left\", validate=\"many_to_one\"\n",
")\n",
"monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n",
"\n",
"# Choose the series BEFORE aggregating: groupby().sum() turns an all-NaN group\n",
"# into 0, so checking the summed column for NaN can never detect missing prices.\n",
"# If no row has a revenue value (e.g., missing price), fall back to units_sold\n",
"# as a teaching proxy.\n",
"value_col = \"revenue\" if monthly_rev[\"revenue\"].notna().any() else \"units_sold\"\n",
"\n",
"df_monthly = (\n",
"    monthly_rev.dropna(subset=[\"month\"])\n",
"    .groupby(\"month\", as_index=False)[value_col]\n",
"    .sum()\n",
"    .rename(columns={value_col: \"total_revenue\"})\n",
"    .sort_values(\"month\")\n",
")\n",
"\n",
"df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n",
"df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n",
"print(\"āœ… Wrote synthetic_monthly_revenue_series.csv\")\n"
] }, { "cell_type": "markdown", "metadata": { "id": "RYvGyVfXuo54" }, "source": [ "### *d. āœ‹šŸ»šŸ›‘ā›”ļø View the first few lines*" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 397 }, "id": "xfE8NMqOurKo", "outputId": "65fccd01-7d12-4ccc-80b0-a13667b79016" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title sentiment_label \\\n", "0 A Light in the Attic neutral \n", "1 A Light in the Attic neutral \n", "2 A Light in the Attic neutral \n", "3 A Light in the Attic neutral \n", "4 A Light in the Attic neutral \n", "\n", " review_text popularity_score rating \n", "0 The plot was straightforward and easy to follow. 3 3 \n", "1 An okay story overall—fine for a casual read. 3 3 \n", "2 It had potential, though it didn’t quite deliv...
3 3 \n", "3 The ending was okay, though a bit predictable. 3 3 \n", "4 The story was steady, but lacked surprises. 3 3 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlesentiment_labelreview_textpopularity_scorerating
0A Light in the AtticneutralThe plot was straightforward and easy to follow.33
1A Light in the AtticneutralAn okay story overall—fine for a casual read.33
2A Light in the AtticneutralIt had potential, though it didn’t quite deliv...33
3A Light in the AtticneutralThe ending was okay, though a bit predictable.33
4A Light in the AtticneutralThe story was steady, but lacked surprises.33
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_reviews", "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 20000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"It was disappointing given the premise.\",\n \"The story didn\\u2019t match the summary hype.\",\n \"The writing was clear, but the dialogue was hit-or-miss.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 1,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 31 } ], "source": [ "df_reviews.head()" ] } ], "metadata": { "colab": { "collapsed_sections": [ "jpASMyIQMaAq", "lquNYCbfL9IM", "0IWuNpxxYDJF", "oCdTsin2Yfp3", "T0TOeRC4Yrnn", "duI5dv3CZYvF", "qMjRKMBQZlJi", "p-1Pr2szaqLk", "SIaJUGIpaH4V", "pY4yCoIuaQqp", "n4-TaNTFgPak", "HnngRNTgacYt", "HF9F9HIzgT7Z", "T8AdKkmASq9a", "OhXbdGD5fH0c", "L2ak1HlcgoTe", "4IXZKcCSgxnq", 
"EhIjz9WohAmZ", "Gi4y9M9KuDWx", "fQhfVaDmuULT", "bmJMXF-Bukdm", "RYvGyVfXuo54" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }