diff --git "a/2a_Python_Analysis (1).ipynb" "b/2a_Python_Analysis (1).ipynb" new file mode 100644--- /dev/null +++ "b/2a_Python_Analysis (1).ipynb" @@ -0,0 +1,2534 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "kz8lLSv6mVQo" + }, + "source": [ + "# **🤖 Data Analysis & Visualization**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jpASMyIQMaAq" + }, + "source": [ + "## **1.** 📦 Install required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f48c8f8c", + "outputId": "4aa2e63e-f7ce-4b6b-9fde-5ee3c36f8c3e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n", + "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n", + "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n", + "Collecting faker\n", + " Downloading faker-40.5.1-py3-none-any.whl.metadata (16 kB)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (5.0.0)\n", + "Collecting vaderSentiment\n", + " Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n", + "Requirement already satisfied: contourpy>=1.0.1 in 
/usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n", + "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from transformers) (3.24.3)\n", + "Requirement already satisfied: huggingface-hub<2.0,>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (1.4.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.3)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2025.11.3)\n", + "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.22.2)\n", + "Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from transformers) (0.24.0)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.7.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.3)\n", + "Requirement already satisfied: requests in 
/usr/local/lib/python3.12/dist-packages (from vaderSentiment) (2.32.4)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.3.0->transformers) (2025.3.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.3.0->transformers) (1.3.0)\n", + "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.3.0->transformers) (0.28.1)\n", + "Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.3.0->transformers) (1.5.4)\n", + "Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.3.0->transformers) (4.15.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.4.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->vaderSentiment) (2026.1.4)\n", + "Requirement already satisfied: typer>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from typer-slim->transformers) (0.24.1)\n", + "Requirement already satisfied: anyio in 
/usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.3.0->transformers) (4.12.1)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.3.0->transformers) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.3.0->transformers) (0.16.0)\n", + "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->transformers) (13.9.4)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim->transformers) (0.0.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim->transformers) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim->transformers) (2.19.2)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer>=0.24.0->typer-slim->transformers) (0.1.2)\n", + "Downloading faker-40.5.1-py3-none-any.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m47.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.0/126.0 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: faker, vaderSentiment\n", + "Successfully installed faker-40.5.1 vaderSentiment-3.3.2\n" + ] + } + ], + "source": [ + "!pip install pandas matplotlib seaborn numpy 
# --- 2. Load & inspect input datasets ---

# a. Initial setup
import os
import random

import numpy as np
import pandas as pd

# b. Create the df_reviews dataframe from the synthetic_book_reviews.csv file.
# pd.read_csv raises on its own if the file cannot be opened.
df_reviews = pd.read_csv("synthetic_book_reviews.csv")

# c. Create the df_sales dataframe from the synthetic_sales_data.csv file.
# Fail loudly when the file is absent: only printing a message would leave
# `df_sales` undefined, and the failure would surface later as a NameError in
# whichever cell touches `df_sales` first.
# The runtime messages are kept in their original Italian
# ("the file ... does not exist" / "file loaded correctly").
if not os.path.isfile("synthetic_sales_data.csv"):
    raise FileNotFoundError("Il file synthetic_sales_data.csv non esiste ❌")

df_sales = pd.read_csv("synthetic_sales_data.csv")
print("File caricato correttamente ✅")
✋🏻🛑⛔️ Visualize the first few lines of the two final datasets: df_reviews and df_sales*" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 449 + }, + "id": "p8FdQFXErOqE", + "outputId": "539e81d6-114e-458e-a24e-8fff57717de0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "df_reviews:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " title sentiment_label review_text \\\n", + "0 A Light in the Attic neutral It didn’t leave a strong impression. \n", + "1 A Light in the Attic neutral A mild and easy read. \n", + "2 A Light in the Attic neutral The writing style was standard. \n", + "3 A Light in the Attic neutral I found it moderately interesting. \n", + "4 A Light in the Attic neutral A modest and simple read. \n", + "\n", + " rating popularity_score \n", + "0 Three 3 \n", + "1 Three 3 \n", + "2 Three 3 \n", + "3 Three 3 \n", + "4 Three 3 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlesentiment_labelreview_textratingpopularity_score
0A Light in the AtticneutralIt didn’t leave a strong impression.Three3
1A Light in the AtticneutralA mild and easy read.Three3
2A Light in the AtticneutralThe writing style was standard.Three3
3A Light in the AtticneutralI found it moderately interesting.Three3
4A Light in the AtticneutralA modest and simple read.Three3
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display(df_sales\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A Light in the Attic\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"A mild and easy read.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 3,\n \"num_unique_values\": 1,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "df_sales:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " title month units_sold sentiment_label\n", + "0 A Light in the Attic 2024-09 100 neutral\n", + "1 A Light in the Attic 2024-10 109 neutral\n", + "2 A Light in the Attic 2024-11 102 neutral\n", + "3 A Light in the Attic 2024-12 107 neutral\n", + "4 A Light in the Attic 2025-01 108 neutral" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlemonthunits_soldsentiment_label
0A Light in the Attic2024-09100neutral
1A Light in the Attic2024-10109neutral
2A Light in the Attic2024-11102neutral
3A Light in the Attic2024-12107neutral
4A Light in the Attic2025-01108neutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
def _preview(caption, frame):
    """Print `caption`, then rich-display the first rows of `frame` (`head()` default)."""
    print(caption)
    display(frame.head())


# Show the top of each dataset under its own caption. The second caption
# starts with a newline so a blank line separates the two previews.
_preview("df_reviews:", df_reviews)
_preview("\ndf_sales:", df_sales)
Run a quality check on the datasets*" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "VArQGPoKrfLm", + "outputId": "d0b2b3ca-15ef-4d04-ae17-4e860d11d2cc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "🔍 Quality Check Report for: df_reviews\n", + "===================================\n", + "\n", + "📏 Shape: (10000, 5)\n", + "\n", + "🔠 Column Types:\n", + "title object\n", + "sentiment_label object\n", + "review_text object\n", + "rating object\n", + "popularity_score int64\n", + "dtype: object\n", + "\n", + "❓ Missing Values:\n", + "title 0\n", + "sentiment_label 0\n", + "review_text 0\n", + "rating 0\n", + "popularity_score 0\n", + "dtype: int64\n", + "\n", + "📋 Duplicate Rows: 0\n", + "\n", + "📊 Summary Statistics:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " count unique top freq \\\n", + "title 10000 999 The Star-Touched Queen 20 \n", + "sentiment_label 10000 3 positive 4380 \n", + "review_text 10000 150 A wonderfully constructed narrative. 104 \n", + "rating 10000 5 One 2260 \n", + "popularity_score 10000.0 NaN NaN NaN \n", + "\n", + " mean std min 25% 50% 75% max \n", + "title NaN NaN NaN NaN NaN NaN NaN \n", + "sentiment_label NaN NaN NaN NaN NaN NaN NaN \n", + "review_text NaN NaN NaN NaN NaN NaN NaN \n", + "rating NaN NaN NaN NaN NaN NaN NaN \n", + "popularity_score 3.282 1.028874 1.0 3.0 3.0 4.0 5.0 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countuniquetopfreqmeanstdmin25%50%75%max
title10000999The Star-Touched Queen20NaNNaNNaNNaNNaNNaNNaN
sentiment_label100003positive4380NaNNaNNaNNaNNaNNaNNaN
review_text10000150A wonderfully constructed narrative.104NaNNaNNaNNaNNaNNaNNaN
rating100005One2260NaNNaNNaNNaNNaNNaNNaN
popularity_score10000.0NaNNaNNaN3.2821.0288741.03.03.04.05.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"quality_check(df_sales, \\\"df_sales\\\")\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"10000\",\n \"max\": \"10000\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"10000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"unique\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 3,\n \"max\": 999,\n \"num_unique_values\": 4,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"top\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"20\",\n \"max\": \"4380\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"4380\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"mean\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 3.282,\n \"max\": 3.282,\n \"num_unique_values\": 1,\n \"samples\": [\n 3.282\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"std\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.0288740730458223,\n \"max\": 1.0288740730458223,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0288740730458223\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"min\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"25%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 3.0,\n \"max\": 3.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n 
}\n },\n {\n \"column\": \"50%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 3.0,\n \"max\": 3.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 3.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"75%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 4.0,\n \"max\": 4.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"max\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "👀 Sample Rows:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " title sentiment_label \\\n", + "9933 Bleach, Vol. 1: Strawberry and the Soul Reaper... positive \n", + "3098 El Deafo positive \n", + "6703 It neutral \n", + "7036 Vegan Vegetarian Omnivore: Dinner for Everyone... neutral \n", + "621 The Most Perfect Thing: Inside (and Outside) a... positive \n", + "\n", + " review_text rating popularity_score \n", + "9933 A masterfully written story. Five 4 \n", + "3098 A captivating and heartfelt read. Five 5 \n", + "6703 An average reading experience. Three 3 \n", + "7036 An adequate storytelling effort. Two 3 \n", + "621 A refreshing and engaging perspective. Four 4 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlesentiment_labelreview_textratingpopularity_score
9933Bleach, Vol. 1: Strawberry and the Soul Reaper...positiveA masterfully written story.Five4
3098El DeafopositiveA captivating and heartfelt read.Five5
6703ItneutralAn average reading experience.Three3
7036Vegan Vegetarian Omnivore: Dinner for Everyone...neutralAn adequate storytelling effort.Two3
621The Most Perfect Thing: Inside (and Outside) a...positiveA refreshing and engaging perspective.Four4
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"quality_check(df_sales, \\\"df_sales\\\")\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"El Deafo\",\n \"The Most Perfect Thing: Inside (and Outside) a Bird's Egg\",\n \"It\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"neutral\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"A captivating and heartfelt read.\",\n \"A refreshing and engaging perspective.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Three\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 4,\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "🔍 Quality Check Report for: df_sales\n", + "=================================\n", + "\n", + "📏 Shape: (18000, 4)\n", + "\n", + "🔠 Column Types:\n", + "title object\n", + "month object\n", + "units_sold int64\n", + "sentiment_label object\n", + "dtype: object\n", + "\n", + "❓ Missing Values:\n", + "title 0\n", + "month 0\n", + "units_sold 0\n", + "sentiment_label 0\n", + "dtype: int64\n", + "\n", + "📋 Duplicate Rows: 0\n", + "\n", + "📊 Summary 
Statistics:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " count unique top freq mean \\\n", + "title 18000 999 The Star-Touched Queen 36 NaN \n", + "month 18000 18 2024-09 1000 NaN \n", + "units_sold 18000.0 NaN NaN NaN 168.024167 \n", + "sentiment_label 18000 3 positive 7884 NaN \n", + "\n", + " std min 25% 50% 75% max \n", + "title NaN NaN NaN NaN NaN NaN \n", + "month NaN NaN NaN NaN NaN NaN \n", + "units_sold 98.656354 0.0 84.0 148.0 262.0 362.0 \n", + "sentiment_label NaN NaN NaN NaN NaN NaN " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countuniquetopfreqmeanstdmin25%50%75%max
title18000999The Star-Touched Queen36NaNNaNNaNNaNNaNNaNNaN
month18000182024-091000NaNNaNNaNNaNNaNNaNNaN
units_sold18000.0NaNNaNNaN168.02416798.6563540.084.0148.0262.0362.0
sentiment_label180003positive7884NaNNaNNaNNaNNaNNaNNaN
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"quality_check(df_sales, \\\"df_sales\\\")\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"18000\",\n \"max\": \"18000\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"18000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"unique\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 3,\n \"max\": 999,\n \"num_unique_values\": 3,\n \"samples\": [\n 999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"top\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"The Star-Touched Queen\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"36\",\n \"max\": \"7884\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"36\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"mean\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 168.02416666666667,\n \"max\": 168.02416666666667,\n \"num_unique_values\": 1,\n \"samples\": [\n 168.02416666666667\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"std\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 98.65635350480927,\n \"max\": 98.65635350480927,\n \"num_unique_values\": 1,\n \"samples\": [\n 98.65635350480927\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"min\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"25%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 84.0,\n \"max\": 84.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 84.0\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"50%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 148.0,\n \"max\": 148.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 148.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"75%\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 262.0,\n \"max\": 262.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 262.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"max\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 362.0,\n \"max\": 362.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 362.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "👀 Sample Rows:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " title month units_sold \\\n", + "7751 The Art and Science of Low Carbohydrate Living 2025-08 289 \n", + "4780 Someone Like You (The Harrisons #2) 2025-07 315 \n", + "602 The Bear and the Piano 2025-05 27 \n", + "13477 The Goldfinch 2025-10 39 \n", + "6612 The Children 2025-03 94 \n", + "\n", + " sentiment_label \n", + "7751 positive \n", + "4780 positive \n", + "602 negative \n", + "13477 negative \n", + "6612 neutral " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlemonthunits_soldsentiment_label
7751The Art and Science of Low Carbohydrate Living2025-08289positive
4780Someone Like You (The Harrisons #2)2025-07315positive
602The Bear and the Piano2025-0527negative
13477The Goldfinch2025-1039negative
6612The Children2025-0394neutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
def quality_check(df, name="DataFrame", sample_size=5, random_state=None):
    """Print and display a quick data-quality report for a DataFrame.

    The report contains, in order: a header with ``name``, the frame's shape,
    the column dtypes, the per-column count of missing values, the number of
    fully duplicated rows, the transposed ``describe(include='all')`` table
    and a random sample of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is only read, never modified.
    name : str, default "DataFrame"
        Label shown in the report header; its length also sizes the
        ``=`` underline.
    sample_size : int, default 5
        Maximum number of rows to show in the "Sample Rows" section. It is
        clamped to ``len(df)`` so frames with fewer rows no longer make
        ``DataFrame.sample`` raise ``ValueError``.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed seed to get the same
        sample rows on every run; ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    None
        Everything is emitted through ``print`` and ``display``.

    Notes
    -----
    ``display`` is not imported here; the function relies on the notebook
    environment (IPython/Jupyter) providing it.
    """
    print(f"\n🔍 Quality Check Report for: {name}")
    print("=" * (25 + len(name)))

    # Basic info
    print(f"\n📏 Shape: {df.shape}")
    print("\n🔠 Column Types:")
    print(df.dtypes)

    # Missing values
    print("\n❓ Missing Values:")
    print(df.isnull().sum())

    # Duplicates
    duplicate_count = df.duplicated().sum()
    print(f"\n📋 Duplicate Rows: {duplicate_count}")

    # Summary stats (include='all' covers non-numeric columns too)
    print("\n📊 Summary Statistics:")
    display(df.describe(include='all').transpose())

    # Sample rows: never ask for more rows than the frame holds
    print("\n👀 Sample Rows:")
    rows_to_show = min(sample_size, len(df))
    display(df.sample(n=rows_to_show, random_state=random_state))


# Run checks
quality_check(df_reviews, "df_reviews")
quality_check(df_sales, "df_sales")


# --- 3. Perform sentiment analysis using VADER ---

# a. Initial setup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# 🤖 Initialize VADER analyzer
analyzer = SentimentIntensityAnalyzer()
Create a function get_sentiment_label that will return the label negative, neutral, or positive based on the VADER analyzer's scoring of the text*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "89809e6f" + }, + "outputs": [], + "source": [ + "def get_sentiment_label(text):\n", + " score = analyzer.polarity_scores(text)[\"compound\"]\n", + " if score >= 0.05:\n", + " return \"positive\"\n", + " elif score <= -0.05:\n", + " return \"negative\"\n", + " else:\n", + " return \"neutral\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DS9eCZ95yQn3" + }, + "source": [ + "### *c. ✋🏻🛑⛔️ Apply get_sentiment_label to df_reviews column named review_text to get sentiment_label column*" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "SpXzFaDfyM7I", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "bf1b14f6-f3d5-44c8-b92b-15865bb89cad" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title sentiment_label review_text \\\n", + "0 A Light in the Attic neutral It didn’t leave a strong impression. \n", + "1 A Light in the Attic neutral A mild and easy read. \n", + "2 A Light in the Attic neutral The writing style was standard. \n", + "3 A Light in the Attic neutral I found it moderately interesting. \n", + "4 A Light in the Attic neutral A modest and simple read. \n", + "\n", + " rating popularity_score vader_sentiment \n", + "0 Three 3 positive \n", + "1 Three 3 positive \n", + "2 Three 3 neutral \n", + "3 Three 3 positive \n", + "4 Three 3 neutral " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlesentiment_labelreview_textratingpopularity_scorevader_sentiment
0A Light in the AtticneutralIt didn’t leave a strong impression.Three3positive
1A Light in the AtticneutralA mild and easy read.Three3positive
2A Light in the AtticneutralThe writing style was standard.Three3neutral
3A Light in the AtticneutralI found it moderately interesting.Three3positive
4A Light in the AtticneutralA modest and simple read.Three3neutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_reviews", + "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"A brilliant blend of emotion and storytelling.\",\n \"The ending was unsatisfying.\",\n \"The dialogue felt natural and authentic.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vader_sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"positive\",\n \"neutral\",\n \"negative\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n", + "\n", + "# Initialize VADER\n", + "analyzer = 
SentimentIntensityAnalyzer()\n", + "\n", + "# Define function\n", + "def get_sentiment_label(text):\n", + " score = analyzer.polarity_scores(text)[\"compound\"]\n", + "\n", + " if score >= 0.05:\n", + " return \"positive\"\n", + " elif score <= -0.05:\n", + " return \"negative\"\n", + " else:\n", + " return \"neutral\"\n", + "\n", + "# Apply VADER to review_text\n", + "df_reviews[\"vader_sentiment\"] = df_reviews[\"review_text\"].apply(get_sentiment_label)\n", + "\n", + "# View first rows\n", + "df_reviews.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5cnPCFFnyXN6" + }, + "source": [ + "### *d. ✋🏻🛑⛔️ View the first few lines of the resulting table df_reviews*" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "ODGyfjBSyZEO", + "outputId": "437e991c-5343-4892-ae45-1e72f6dde135" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " review_text sentiment_label vader_sentiment\n", + "0 It didn’t leave a strong impression. neutral positive\n", + "1 A mild and easy read. neutral positive\n", + "2 The writing style was standard. neutral neutral\n", + "3 I found it moderately interesting. neutral positive\n", + "4 A modest and simple read. neutral neutral" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
review_textsentiment_labelvader_sentiment
0It didn’t leave a strong impression.neutralpositive
1A mild and easy read.neutralpositive
2The writing style was standard.neutralneutral
3I found it moderately interesting.neutralpositive
4A modest and simple read.neutralneutral
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df_reviews[[\\\"review_text\\\", \\\"sentiment_label\\\", \\\"vader_sentiment\\\"]]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"A mild and easy read.\",\n \"A modest and simple read.\",\n \"The writing style was standard.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vader_sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "df_reviews[[\"review_text\", \"sentiment_label\", \"vader_sentiment\"]].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qy3Hqm-FojvT" + }, + "source": [ + "## **4.** 📊 Use the following data visualization code snippets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lcjGSw2bzqtZ" + }, + "source": [ + "### *a. 
Initial setup*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p5LV2o1rzsiC" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import matplotlib.dates as mdates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tvaBtswpGS__" + }, + "outputs": [], + "source": [ + "# ----------------------------\n", + "# Outputs (for Hugging Face app)\n", + "# ----------------------------\n", + "# In the notebook: you still SEE interactive tables/plots inline.\n", + "# For the Space dashboard: we also SAVE the same outputs as files.\n", + "\n", + "from pathlib import Path\n", + "\n", + "ART_DIR = Path(\"artifacts\")\n", + "PY_FIG = ART_DIR / \"py\" / \"figures\"\n", + "PY_TAB = ART_DIR / \"py\" / \"tables\"\n", + "\n", + "for p in [PY_FIG, PY_TAB]:\n", + " p.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print(\"✅ Output folders:\")\n", + "print(\" -\", PY_FIG.resolve())\n", + "print(\" -\", PY_TAB.resolve())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b9T1rkBe0AJU" + }, + "source": [ + "### *b. Sample of 5 books for each popularity level for visualizations*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLdFmGqXqo_t" + }, + "outputs": [], + "source": [ + "sampled_titles = []\n", + "for pop_score in sorted(df_reviews[\"popularity_score\"].dropna().unique()):\n", + " all_titles = df_reviews[df_reviews[\"popularity_score\"] == pop_score][\"title\"].unique()\n", + " sampled = random.sample(list(all_titles), min(5, len(all_titles)))\n", + " sampled_titles.extend(sampled)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xq7-C8m70mMH" + }, + "source": [ + "### *c. 
Copy relevant sales, reviews, and book names*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "laDdMece0qrq" + }, + "outputs": [], + "source": [ + "sampled_sales = df_sales[df_sales[\"title\"].isin(sampled_titles)].copy()\n", + "sampled_reviews = df_reviews[df_reviews[\"title\"].isin(sampled_titles)].copy()\n", + "sampled_books = df_reviews[df_reviews[\"title\"].isin(sampled_titles)].copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8YtfkG_A0wTy" + }, + "source": [ + "### *d. Plot sales trends over time for the sampled books*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1iTVzflW0Rkw" + }, + "outputs": [], + "source": [ + "# 🕒 Ensure datetime format\n", + "df_sales[\"month\"] = pd.to_datetime(df_sales[\"month\"])\n", + "# 🎨 Color mapping\n", + "popularity_colors = {\n", + " 1: \"darkred\", 2: \"orangered\", 3: \"gold\", 4: \"mediumseagreen\", 5: \"royalblue\"\n", + "}\n", + "\n", + "# 📈 Plot 1: Sales trends\n", + "plt.figure(figsize=(20, 8))\n", + "for title in sampled_titles:\n", + " row = sampled_books[sampled_books[\"title\"] == title].iloc[0]\n", + " color = popularity_colors.get(row[\"popularity_score\"], \"gray\")\n", + " subset = sampled_sales[sampled_sales[\"title\"] == title]\n", + " plt.plot(subset[\"month\"], subset[\"units_sold\"], label=f\"{title} (Pop. 
{row['popularity_score']})\", color=color)\n", + "\n", + "plt.title(\"📈 Sales Trends Over Time (5 per Popularity Level)\")\n", + "plt.xlabel(\"Month\")\n", + "plt.ylabel(\"Units Sold\")\n", + "plt.xticks(rotation=45)\n", + "plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='small')\n", + "plt.grid(True)\n", + "plt.tight_layout()\n", + "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))\n", + "plt.savefig(PY_FIG / 'sales_trends_sampled_titles.png', dpi=150)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lDpMkjDP1K6j" + }, + "source": [ + "### *e. Plot sentiment_label distribution per book*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dn1Jgd5R1KLu" + }, + "outputs": [], + "source": [ + "# 🎨 Give a new name to each book that includes the rating together with the title\n", + "sampled_reviews[\"grouped_title\"] = sampled_reviews[\"rating\"].astype(str) + \"★ | \" + sampled_reviews[\"title\"]\n", + "\n", + "# 📊 Aggregate sentiment counts\n", + "sentiment_counts = (\n", + " sampled_reviews.groupby([\"grouped_title\", \"sentiment_label\"])\n", + " .size()\n", + " .unstack(fill_value=0)[[\"negative\", \"neutral\", \"positive\"]] # consistent order\n", + ")\n", + "\n", + "# 💾 Save table for HF dashboard\n", + "sentiment_counts.reset_index().to_csv(PY_TAB / 'sentiment_counts_sampled.csv', index=False)\n", + "\n", + "\n", + "# ✅ Plot stacked horizontal bars\n", + "fig, ax = plt.subplots(figsize=(12, 14))\n", + "sentiment_counts.plot.barh(\n", + " stacked=True,\n", + " ax=ax,\n", + " color={\"negative\": \"royalblue\", \"neutral\": \"lightgray\", \"positive\": \"crimson\"}\n", + ")\n", + "\n", + "plt.title(\"💬 Sentiment Distribution in Reviews (5 Books per Popularity Level)\", fontsize=14)\n", + "plt.xlabel(\"Number of Reviews\")\n", + "plt.ylabel(\"Book Title (Grouped by Popularity Score)\")\n", + "plt.legend(title=\"Sentiment\", loc=\"lower right\")\n", + 
"plt.grid(axis=\"x\", linestyle=\"--\", alpha=0.6)\n", + "plt.tight_layout()\n", + "plt.savefig(PY_FIG / 'sentiment_distribution_sampled_titles.png', dpi=150)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rmgylC1ENCHy" + }, + "source": [ + "## **5.** 🔮 Forecast book sales with the following ARIMA code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jFV4JE1R3FKH" + }, + "source": [ + "### *a. Initial setup*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mh8Alha03H22" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import statsmodels.api as sm\n", + "from itertools import product\n", + "import matplotlib.cm as cm\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gHucD8OW3U0w" + }, + "source": [ + "### *b. Define function find_best_arima to try different ARIMA parameter values and return the best combination for each book's sales forecast*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "477fa43f" + }, + "outputs": [], + "source": [ + "def find_best_arima(series, p_range=(0, 5), d_range=(0, 2), q_range=(0, 1)):\n", + " best_aic = float(\"inf\")\n", + " best_order = None\n", + " best_model = None\n", + "\n", + " for p, d, q in product(range(p_range[0], p_range[1] + 1),\n", + " range(d_range[0], d_range[1] + 1),\n", + " range(q_range[0], q_range[1] + 1)):\n", + " try:\n", + " model = sm.tsa.ARIMA(series, order=(p, d, q))\n", + " results = model.fit()\n", + " if results.aic < best_aic:\n", + " best_aic = results.aic\n", + " best_order = (p, d, q)\n", + " best_model = results\n", + " except:\n", + " continue\n", + "\n", + " return best_order, best_model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rq5t1Hey3jkD" + }, + "source": [ + "### *c. 
Plot the figure*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DmxGdvLE3dHQ" + }, + "outputs": [], + "source": [ + "# 🎨 One color per sampled title from seaborn's tab10 palette (it has 10 colors, so it cycles beyond 10 titles)\n", + "colors = sns.color_palette(\"tab10\", len(sampled_titles))\n", + "\n", + "plt.figure(figsize=(16, 10))\n", + "\n", + "for i, title in enumerate(sampled_titles):\n", + " book_sales = sampled_sales[sampled_sales[\"title\"] == title].copy()\n", + " book_sales[\"month\"] = pd.to_datetime(book_sales[\"month\"])\n", + " book_sales = book_sales.sort_values(\"month\").set_index(\"month\")\n", + "\n", + " with warnings.catch_warnings():\n", + " warnings.simplefilter(\"ignore\")\n", + " best_order, best_model = find_best_arima(book_sales[\"units_sold\"])\n", + " if best_model is not None:\n", + " forecast = best_model.get_forecast(steps=6)\n", + " forecast_index = pd.date_range(start=book_sales.index[-1] + pd.DateOffset(months=1), periods=6, freq='MS')\n", + "\n", + " # 🟦 Plot observed sales (solid line)\n", + " plt.plot(book_sales.index, book_sales[\"units_sold\"], color=colors[i], label=title, linewidth=2)\n", + "\n", + " # 🟠 Plot forecast (dashed line, same color)\n", + " plt.plot(forecast_index, forecast.predicted_mean, linestyle=\"--\", color=colors[i], linewidth=2)\n", + "\n", + "# 📈 Final formatting\n", + "plt.title(\"📈 ARIMA Forecasts for Sampled Books (1 per Popularity Level)\", fontsize=14)\n", + "plt.xlabel(\"Month\")\n", + "plt.ylabel(\"Units Sold\")\n", + "plt.xticks(rotation=45)\n", + "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))\n", + "plt.grid(True)\n", + "plt.legend(loc=\"center left\", bbox_to_anchor=(1, 0.5), fontsize=\"small\")\n", + "plt.tight_layout()\n", + "plt.savefig(PY_FIG / 'arima_forecasts_sampled_titles.png', dpi=150)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SKBcx3fyCFly" + }, + "source": [ + "## **6.** 🏷️ Decide 
on price changes with a 
rule-based approach based on sentiment and future revenue" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nY-vV2JJDZqu" + }, + "source": [ + "### *a. Calculate average sales per book*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nbDT_RHaDD2R" + }, + "outputs": [], + "source": [ + "avg_sales = df_sales.groupby(\"title\")[\"units_sold\"].mean().reset_index()\n", + "avg_sales.columns = [\"title\", \"avg_units_sold\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "94wi-RvkDf2z" + }, + "source": [ + "### *b. Calculate sentiment distribution per book*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fWjQ9IOXDk-M" + }, + "outputs": [], + "source": [ + "sentiment_counts = df_reviews.groupby([\"title\", \"sentiment_label\"]).size().unstack(fill_value=0)\n", + "sentiment_counts[\"total\"] = sentiment_counts.sum(axis=1)\n", + "sentiment_counts[\"positive_ratio\"] = sentiment_counts.get(\"positive\") / sentiment_counts[\"total\"]\n", + "sentiment_counts[\"negative_ratio\"] = sentiment_counts.get(\"negative\") / sentiment_counts[\"total\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vm10ym_iDtEW" + }, + "source": [ + "### *c. Merge the calculated sales and sentiment characteristics*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T-zlh6rBDpxg" + }, + "outputs": [], + "source": [ + "df_decision = avg_sales.merge(sentiment_counts, on=\"title\", how=\"left\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1WIWDojyD7fK" + }, + "source": [ + "### *d. 
✋🏻🛑⛔️ Create the pricing_decision function as a basic rule-based pricing decider based on sentiment and revenue*\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b5qJCb46Dxfb" + }, + "source": [ + "* If there are 120 or more average units sold and 0.6 or higher positive ratio, the decision should be to increase price.\n", + "* If there are 60 or less average units sold and 0.4 or higher negative ratio, the decision should be to decrease price.\n", + "* Otherwise, the price should be kept the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XBzozedwD6yx" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xmLEdF14EPAA" + }, + "source": [ + "### *e. ✋🏻🛑⛔️ Run the pricing_decision function and check out the first few decisions*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TZ0ZhgHrEQJB" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WTkP2_-EApev" + }, + "source": [ + "\n", + "## **7.** 💾 Save Python outputs for the Hugging Face dashboard" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3EIjfnokGpJv" + }, + "source": [ + "\n", + "This section exports **HF-ready artifacts** into a consistent folder structure:\n", + "\n", + "- `(root folder)py/figures/` (Python-generated visuals)\n", + "- `(root folder)py/tables/` (tables/metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZJJ4PMgIApev" + }, + "outputs": [], + "source": [ + "\n", + "import json\n", + "\n", + "# -------------------------\n", + "# 1) Dashboard table (monthly) — reuse if already built\n", + "# -------------------------\n", + "if \"df_monthly\" in globals() and df_monthly is not None:\n", + " df_dashboard = df_monthly.copy()\n", + "else:\n", + " # fallback: monthly units sold only\n", + " df_dashboard = (\n", + " 
df_sales.groupby(\"month\", as_index=False)\n", + " .agg(total_units_sold=(\"units_sold\", \"sum\"))\n", + " .sort_values(\"month\")\n", + " )\n", + "\n", + "# Save the single overview dashboard table\n", + "df_dashboard.to_csv(PY_TAB / \"df_dashboard.csv\", index=False)\n", + "\n", + "# -------------------------\n", + "# 2) KPI summary (small json) — computed from raw df_sales + df_dashboard\n", + "# -------------------------\n", + "kpis = {\n", + " \"n_titles\": int(df_sales[\"title\"].nunique()),\n", + " \"n_months\": int(df_dashboard[\"month\"].nunique()),\n", + " \"total_units_sold\": float(df_sales[\"units_sold\"].sum()),\n", + "}\n", + "\n", + "# Only include the revenue KPI if df_dashboard has a total_revenue column with at least one non-null value\n", + "if \"total_revenue\" in df_dashboard.columns and df_dashboard[\"total_revenue\"].notna().any():\n", + " kpis[\"total_revenue\"] = float(df_dashboard[\"total_revenue\"].sum())\n", + "\n", + "with open(PY_FIG / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(kpis, f, indent=2)\n", + "\n", + "# -------------------------\n", + "# 3) Python tables (title-level quick inspection)\n", + "# -------------------------\n", + "df_by_title_units = (\n", + " df_sales.groupby(\"title\", as_index=False)\n", + " .agg(total_units_sold=(\"units_sold\", \"sum\"))\n", + " .sort_values(\"total_units_sold\", ascending=False)\n", + ")\n", + "df_by_title_units.head(10).to_csv(PY_TAB / \"top_titles_by_units_sold.csv\", index=False)\n", + "\n", + "# Optional: title-level revenue table ONLY if df_sales already has per-row revenue\n", + "if \"revenue\" in df_sales.columns and df_sales[\"revenue\"].notna().any():\n", + " df_by_title_rev = (\n", + " df_sales.groupby(\"title\", as_index=False)\n", + " .agg(total_revenue=(\"revenue\", \"sum\"))\n", + " .sort_values(\"total_revenue\", ascending=False)\n", + " )\n", + " df_by_title_rev.head(10).to_csv(PY_TAB / \"top_titles_by_revenue.csv\", index=False)\n", + "\n", + 
"print(\"✅ Exports written to artifacts/:\")\n", + "print(\" - common/: df_dashboard.csv, kpis.json\")\n", + "print(\" - py/tables/: top_titles_by_units_sold.csv (+ optional top_titles_by_revenue.csv)\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0b4e76d3" + }, + "source": [ + "✅ **Extra outputs for the R notebook**: `(root folder)common/r_input_title_level.csv` and `(root folder)common/r_input_monthly_revenue.csv` (these are the only two files the R portion needs)." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "jpASMyIQMaAq", + "NZd99NpKkKyp", + "TTxUKDYINPxV", + "Qy3Hqm-FojvT", + "rmgylC1ENCHy", + "SKBcx3fyCFly", + "WTkP2_-EApev" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file