Charlottebke committed on
Commit
619d4eb
·
verified ·
1 Parent(s): 6b43de4

Upload 2 files

Browse files
1_Data_Creation (1).ipynb ADDED
@@ -0,0 +1,2015 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "4ba6aba8"
7
+ },
8
+ "source": [
9
+ "# 🤖 **Data Collection, Creation, Storage, and Processing**\n"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "jpASMyIQMaAq"
16
+ },
17
+ "source": [
18
+ "## **1.** 📦 Install required packages"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "metadata": {
25
+ "colab": {
26
+ "base_uri": "https://localhost:8080/"
27
+ },
28
+ "id": "f48c8f8c",
29
+ "outputId": "12bccee2-077c-492f-9e8e-615db2caa9dc"
30
+ },
31
+ "outputs": [
32
+ {
33
+ "output_type": "stream",
34
+ "name": "stdout",
35
+ "text": [
36
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
37
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
38
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
39
+ "Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
40
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
41
+ "Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
42
+ "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
43
+ "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
44
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
45
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
46
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.3)\n",
47
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
48
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
49
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.61.1)\n",
50
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)\n",
51
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)\n",
52
+ "Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
53
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
54
+ "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
55
+ "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.1)\n",
56
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
57
+ "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
58
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
59
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
60
+ ]
61
+ }
62
+ ],
63
+ "source": [
64
+ "!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {
70
+ "id": "lquNYCbfL9IM"
71
+ },
72
+ "source": [
73
+ "## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "markdown",
78
+ "metadata": {
79
+ "id": "0IWuNpxxYDJF"
80
+ },
81
+ "source": [
82
+ "### *a. Initial setup*\n",
83
+ "Define the base URL of the website you will scrape, as well as how and what you will scrape"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 5,
89
+ "metadata": {
90
+ "id": "91d52125"
91
+ },
92
+ "outputs": [],
93
+ "source": [
94
+ "import requests\n",
95
+ "from bs4 import BeautifulSoup\n",
96
+ "import pandas as pd\n",
97
+ "import time\n",
98
+ "\n",
99
+ "base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
100
+ "headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
101
+ "\n",
102
+ "titles, prices, ratings = [], [], []"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "markdown",
107
+ "metadata": {
108
+ "id": "oCdTsin2Yfp3"
109
+ },
110
+ "source": [
111
+ "### *b. Fill titles, prices, and ratings from the web pages*"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 6,
117
+ "metadata": {
118
+ "id": "xqO5Y3dnYhxt"
119
+ },
120
+ "outputs": [],
121
+ "source": [
122
+ "# Loop through all 50 pages\n",
123
+ "for page in range(1, 51):\n",
124
+ " url = base_url.format(page)\n",
125
+ " response = requests.get(url, headers=headers)\n",
126
+ " soup = BeautifulSoup(response.content, \"html.parser\")\n",
127
+ " books = soup.find_all(\"article\", class_=\"product_pod\")\n",
128
+ "\n",
129
+ " for book in books:\n",
130
+ " titles.append(book.h3.a[\"title\"])\n",
131
+ " prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n",
132
+ " ratings.append(book.p.get(\"class\")[1])\n",
133
+ "\n",
134
+ " time.sleep(0.5) # polite scraping delay"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "markdown",
139
+ "metadata": {
140
+ "id": "T0TOeRC4Yrnn"
141
+ },
142
+ "source": [
143
+ "### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 8,
149
+ "metadata": {
150
+ "id": "l5FkkNhUYTHh",
151
+ "colab": {
152
+ "base_uri": "https://localhost:8080/",
153
+ "height": 518
154
+ },
155
+ "outputId": "05fcbb8a-6fa1-4eb8-a884-659333c6d723"
156
+ },
157
+ "outputs": [
158
+ {
159
+ "output_type": "stream",
160
+ "name": "stdout",
161
+ "text": [
162
+ "Length check:\n",
163
+ "Titles: 1000\n",
164
+ "Prices: 1000\n",
165
+ "Ratings: 1000\n",
166
+ "\n",
167
+ "DataFrame Shape: (1000, 3)\n",
168
+ "\n",
169
+ "Data Types:\n",
170
+ "title string[python]\n",
171
+ "price float64\n",
172
+ "rating string[python]\n",
173
+ "dtype: object\n",
174
+ "\n",
175
+ "Missing Values:\n",
176
+ "title 0\n",
177
+ "price 0\n",
178
+ "rating 0\n",
179
+ "dtype: int64\n"
180
+ ]
181
+ },
182
+ {
183
+ "output_type": "display_data",
184
+ "data": {
185
+ "text/plain": [
186
+ " title price rating\n",
187
+ "0 A Light in the Attic 51.77 Three\n",
188
+ "1 Tipping the Velvet 53.74 One\n",
189
+ "2 Soumission 50.10 One\n",
190
+ "3 Sharp Objects 47.82 Four\n",
191
+ "4 Sapiens: A Brief History of Humankind 54.23 Five"
192
+ ],
193
+ "text/html": [
194
+ "\n",
195
+ " <div id=\"df-3db31cf8-831e-437a-aa18-478c349c1192\" class=\"colab-df-container\">\n",
196
+ " <div>\n",
197
+ "<style scoped>\n",
198
+ " .dataframe tbody tr th:only-of-type {\n",
199
+ " vertical-align: middle;\n",
200
+ " }\n",
201
+ "\n",
202
+ " .dataframe tbody tr th {\n",
203
+ " vertical-align: top;\n",
204
+ " }\n",
205
+ "\n",
206
+ " .dataframe thead th {\n",
207
+ " text-align: right;\n",
208
+ " }\n",
209
+ "</style>\n",
210
+ "<table border=\"1\" class=\"dataframe\">\n",
211
+ " <thead>\n",
212
+ " <tr style=\"text-align: right;\">\n",
213
+ " <th></th>\n",
214
+ " <th>title</th>\n",
215
+ " <th>price</th>\n",
216
+ " <th>rating</th>\n",
217
+ " </tr>\n",
218
+ " </thead>\n",
219
+ " <tbody>\n",
220
+ " <tr>\n",
221
+ " <th>0</th>\n",
222
+ " <td>A Light in the Attic</td>\n",
223
+ " <td>51.77</td>\n",
224
+ " <td>Three</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>1</th>\n",
228
+ " <td>Tipping the Velvet</td>\n",
229
+ " <td>53.74</td>\n",
230
+ " <td>One</td>\n",
231
+ " </tr>\n",
232
+ " <tr>\n",
233
+ " <th>2</th>\n",
234
+ " <td>Soumission</td>\n",
235
+ " <td>50.10</td>\n",
236
+ " <td>One</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>3</th>\n",
240
+ " <td>Sharp Objects</td>\n",
241
+ " <td>47.82</td>\n",
242
+ " <td>Four</td>\n",
243
+ " </tr>\n",
244
+ " <tr>\n",
245
+ " <th>4</th>\n",
246
+ " <td>Sapiens: A Brief History of Humankind</td>\n",
247
+ " <td>54.23</td>\n",
248
+ " <td>Five</td>\n",
249
+ " </tr>\n",
250
+ " </tbody>\n",
251
+ "</table>\n",
252
+ "</div>\n",
253
+ " <div class=\"colab-df-buttons\">\n",
254
+ "\n",
255
+ " <div class=\"colab-df-container\">\n",
256
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3db31cf8-831e-437a-aa18-478c349c1192')\"\n",
257
+ " title=\"Convert this dataframe to an interactive table.\"\n",
258
+ " style=\"display:none;\">\n",
259
+ "\n",
260
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
261
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
262
+ " </svg>\n",
263
+ " </button>\n",
264
+ "\n",
265
+ " <style>\n",
266
+ " .colab-df-container {\n",
267
+ " display:flex;\n",
268
+ " gap: 12px;\n",
269
+ " }\n",
270
+ "\n",
271
+ " .colab-df-convert {\n",
272
+ " background-color: #E8F0FE;\n",
273
+ " border: none;\n",
274
+ " border-radius: 50%;\n",
275
+ " cursor: pointer;\n",
276
+ " display: none;\n",
277
+ " fill: #1967D2;\n",
278
+ " height: 32px;\n",
279
+ " padding: 0 0 0 0;\n",
280
+ " width: 32px;\n",
281
+ " }\n",
282
+ "\n",
283
+ " .colab-df-convert:hover {\n",
284
+ " background-color: #E2EBFA;\n",
285
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
286
+ " fill: #174EA6;\n",
287
+ " }\n",
288
+ "\n",
289
+ " .colab-df-buttons div {\n",
290
+ " margin-bottom: 4px;\n",
291
+ " }\n",
292
+ "\n",
293
+ " [theme=dark] .colab-df-convert {\n",
294
+ " background-color: #3B4455;\n",
295
+ " fill: #D2E3FC;\n",
296
+ " }\n",
297
+ "\n",
298
+ " [theme=dark] .colab-df-convert:hover {\n",
299
+ " background-color: #434B5C;\n",
300
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
301
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
302
+ " fill: #FFFFFF;\n",
303
+ " }\n",
304
+ " </style>\n",
305
+ "\n",
306
+ " <script>\n",
307
+ " const buttonEl =\n",
308
+ " document.querySelector('#df-3db31cf8-831e-437a-aa18-478c349c1192 button.colab-df-convert');\n",
309
+ " buttonEl.style.display =\n",
310
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
311
+ "\n",
312
+ " async function convertToInteractive(key) {\n",
313
+ " const element = document.querySelector('#df-3db31cf8-831e-437a-aa18-478c349c1192');\n",
314
+ " const dataTable =\n",
315
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
316
+ " [key], {});\n",
317
+ " if (!dataTable) return;\n",
318
+ "\n",
319
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
320
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
321
+ " + ' to learn more about interactive tables.';\n",
322
+ " element.innerHTML = '';\n",
323
+ " dataTable['output_type'] = 'display_data';\n",
324
+ " await google.colab.output.renderOutput(dataTable, element);\n",
325
+ " const docLink = document.createElement('div');\n",
326
+ " docLink.innerHTML = docLinkHtml;\n",
327
+ " element.appendChild(docLink);\n",
328
+ " }\n",
329
+ " </script>\n",
330
+ " </div>\n",
331
+ "\n",
332
+ "\n",
333
+ " </div>\n",
334
+ " </div>\n"
335
+ ],
336
+ "application/vnd.google.colaboratory.intrinsic+json": {
337
+ "type": "dataframe",
338
+ "summary": "{\n \"name\": \"display(df_books\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
339
+ }
340
+ },
341
+ "metadata": {}
342
+ }
343
+ ],
344
+ "source": [
345
+ "# =========================\n",
346
+ "# Part 2-c: Create df_books\n",
347
+ "# =========================\n",
348
+ "\n",
349
+ "# 1️⃣ Check that all lists have the same length\n",
350
+ "print(\"Length check:\")\n",
351
+ "print(\"Titles:\", len(titles))\n",
352
+ "print(\"Prices:\", len(prices))\n",
353
+ "print(\"Ratings:\", len(ratings))\n",
354
+ "\n",
355
+ "if not (len(titles) == len(prices) == len(ratings)):\n",
356
+ " raise ValueError(\"The lists do not have the same length. Scraping may have failed on some pages.\")\n",
357
+ "\n",
358
+ "# 2️⃣ Create the dataframe\n",
359
+ "df_books = pd.DataFrame({\n",
360
+ " \"title\": pd.Series(titles, dtype=\"string\").str.strip(),\n",
361
+ " \"price\": pd.to_numeric(prices, errors=\"coerce\"),\n",
362
+ " \"rating\": pd.Series(ratings, dtype=\"string\").str.strip()\n",
363
+ "})\n",
364
+ "\n",
365
+ "# 3️⃣ Reset index (clean structure)\n",
366
+ "df_books = df_books.reset_index(drop=True)\n",
367
+ "\n",
368
+ "# 4️⃣ Basic validation\n",
369
+ "print(\"\\nDataFrame Shape:\", df_books.shape)\n",
370
+ "print(\"\\nData Types:\")\n",
371
+ "print(df_books.dtypes)\n",
372
+ "\n",
373
+ "print(\"\\nMissing Values:\")\n",
374
+ "print(df_books.isna().sum())\n",
375
+ "\n",
376
+ "display(df_books.head())"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "markdown",
381
+ "metadata": {
382
+ "id": "duI5dv3CZYvF"
383
+ },
384
+ "source": [
385
+ "### *d. Save web-scraped dataframe either as a CSV or Excel file*"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 9,
391
+ "metadata": {
392
+ "id": "lC1U_YHtZifh"
393
+ },
394
+ "outputs": [],
395
+ "source": [
396
+ "# 💾 Save to CSV\n",
397
+ "df_books.to_csv(\"books_data.csv\", index=False)\n",
398
+ "\n",
399
+ "# 💾 Or save to Excel\n",
400
+ "# df_books.to_excel(\"books_data.xlsx\", index=False)"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "markdown",
405
+ "metadata": {
406
+ "id": "qMjRKMBQZlJi"
407
+ },
408
+ "source": [
409
+ "### *e. ✋🏻🛑⛔️ View first few lines*"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 10,
415
+ "metadata": {
416
+ "colab": {
417
+ "base_uri": "https://localhost:8080/",
418
+ "height": 206
419
+ },
420
+ "id": "O_wIvTxYZqCK",
421
+ "outputId": "29327c64-20f0-41e2-c635-e25d5ed002ea"
422
+ },
423
+ "outputs": [
424
+ {
425
+ "output_type": "execute_result",
426
+ "data": {
427
+ "text/plain": [
428
+ " title price rating\n",
429
+ "0 A Light in the Attic 51.77 Three\n",
430
+ "1 Tipping the Velvet 53.74 One\n",
431
+ "2 Soumission 50.10 One\n",
432
+ "3 Sharp Objects 47.82 Four\n",
433
+ "4 Sapiens: A Brief History of Humankind 54.23 Five"
434
+ ],
435
+ "text/html": [
436
+ "\n",
437
+ " <div id=\"df-03f6a076-f2f8-4963-8cc7-73e8cff5f5c0\" class=\"colab-df-container\">\n",
438
+ " <div>\n",
439
+ "<style scoped>\n",
440
+ " .dataframe tbody tr th:only-of-type {\n",
441
+ " vertical-align: middle;\n",
442
+ " }\n",
443
+ "\n",
444
+ " .dataframe tbody tr th {\n",
445
+ " vertical-align: top;\n",
446
+ " }\n",
447
+ "\n",
448
+ " .dataframe thead th {\n",
449
+ " text-align: right;\n",
450
+ " }\n",
451
+ "</style>\n",
452
+ "<table border=\"1\" class=\"dataframe\">\n",
453
+ " <thead>\n",
454
+ " <tr style=\"text-align: right;\">\n",
455
+ " <th></th>\n",
456
+ " <th>title</th>\n",
457
+ " <th>price</th>\n",
458
+ " <th>rating</th>\n",
459
+ " </tr>\n",
460
+ " </thead>\n",
461
+ " <tbody>\n",
462
+ " <tr>\n",
463
+ " <th>0</th>\n",
464
+ " <td>A Light in the Attic</td>\n",
465
+ " <td>51.77</td>\n",
466
+ " <td>Three</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>1</th>\n",
470
+ " <td>Tipping the Velvet</td>\n",
471
+ " <td>53.74</td>\n",
472
+ " <td>One</td>\n",
473
+ " </tr>\n",
474
+ " <tr>\n",
475
+ " <th>2</th>\n",
476
+ " <td>Soumission</td>\n",
477
+ " <td>50.10</td>\n",
478
+ " <td>One</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>3</th>\n",
482
+ " <td>Sharp Objects</td>\n",
483
+ " <td>47.82</td>\n",
484
+ " <td>Four</td>\n",
485
+ " </tr>\n",
486
+ " <tr>\n",
487
+ " <th>4</th>\n",
488
+ " <td>Sapiens: A Brief History of Humankind</td>\n",
489
+ " <td>54.23</td>\n",
490
+ " <td>Five</td>\n",
491
+ " </tr>\n",
492
+ " </tbody>\n",
493
+ "</table>\n",
494
+ "</div>\n",
495
+ " <div class=\"colab-df-buttons\">\n",
496
+ "\n",
497
+ " <div class=\"colab-df-container\">\n",
498
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-03f6a076-f2f8-4963-8cc7-73e8cff5f5c0')\"\n",
499
+ " title=\"Convert this dataframe to an interactive table.\"\n",
500
+ " style=\"display:none;\">\n",
501
+ "\n",
502
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
503
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
504
+ " </svg>\n",
505
+ " </button>\n",
506
+ "\n",
507
+ " <style>\n",
508
+ " .colab-df-container {\n",
509
+ " display:flex;\n",
510
+ " gap: 12px;\n",
511
+ " }\n",
512
+ "\n",
513
+ " .colab-df-convert {\n",
514
+ " background-color: #E8F0FE;\n",
515
+ " border: none;\n",
516
+ " border-radius: 50%;\n",
517
+ " cursor: pointer;\n",
518
+ " display: none;\n",
519
+ " fill: #1967D2;\n",
520
+ " height: 32px;\n",
521
+ " padding: 0 0 0 0;\n",
522
+ " width: 32px;\n",
523
+ " }\n",
524
+ "\n",
525
+ " .colab-df-convert:hover {\n",
526
+ " background-color: #E2EBFA;\n",
527
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
528
+ " fill: #174EA6;\n",
529
+ " }\n",
530
+ "\n",
531
+ " .colab-df-buttons div {\n",
532
+ " margin-bottom: 4px;\n",
533
+ " }\n",
534
+ "\n",
535
+ " [theme=dark] .colab-df-convert {\n",
536
+ " background-color: #3B4455;\n",
537
+ " fill: #D2E3FC;\n",
538
+ " }\n",
539
+ "\n",
540
+ " [theme=dark] .colab-df-convert:hover {\n",
541
+ " background-color: #434B5C;\n",
542
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
543
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
544
+ " fill: #FFFFFF;\n",
545
+ " }\n",
546
+ " </style>\n",
547
+ "\n",
548
+ " <script>\n",
549
+ " const buttonEl =\n",
550
+ " document.querySelector('#df-03f6a076-f2f8-4963-8cc7-73e8cff5f5c0 button.colab-df-convert');\n",
551
+ " buttonEl.style.display =\n",
552
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
553
+ "\n",
554
+ " async function convertToInteractive(key) {\n",
555
+ " const element = document.querySelector('#df-03f6a076-f2f8-4963-8cc7-73e8cff5f5c0');\n",
556
+ " const dataTable =\n",
557
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
558
+ " [key], {});\n",
559
+ " if (!dataTable) return;\n",
560
+ "\n",
561
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
562
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
563
+ " + ' to learn more about interactive tables.';\n",
564
+ " element.innerHTML = '';\n",
565
+ " dataTable['output_type'] = 'display_data';\n",
566
+ " await google.colab.output.renderOutput(dataTable, element);\n",
567
+ " const docLink = document.createElement('div');\n",
568
+ " docLink.innerHTML = docLinkHtml;\n",
569
+ " element.appendChild(docLink);\n",
570
+ " }\n",
571
+ " </script>\n",
572
+ " </div>\n",
573
+ "\n",
574
+ "\n",
575
+ " </div>\n",
576
+ " </div>\n"
577
+ ],
578
+ "application/vnd.google.colaboratory.intrinsic+json": {
579
+ "type": "dataframe",
580
+ "variable_name": "df_books",
581
+ "summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
582
+ }
583
+ },
584
+ "metadata": {},
585
+ "execution_count": 10
586
+ }
587
+ ],
588
+ "source": [
589
+ "# Show the first 5 rows\n",
590
+ "df_books.head()"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "markdown",
595
+ "metadata": {
596
+ "id": "p-1Pr2szaqLk"
597
+ },
598
+ "source": [
599
+ "## **3.** 🧩 Create a meaningful connection between real & synthetic datasets"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "markdown",
604
+ "metadata": {
605
+ "id": "SIaJUGIpaH4V"
606
+ },
607
+ "source": [
608
+ "### *a. Initial setup*"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 11,
614
+ "metadata": {
615
+ "id": "-gPXGcRPuV_9"
616
+ },
617
+ "outputs": [],
618
+ "source": [
619
+ "import numpy as np\n",
620
+ "import random\n",
621
+ "from datetime import datetime\n",
622
+ "import warnings\n",
623
+ "\n",
624
+ "warnings.filterwarnings(\"ignore\")\n",
625
+ "random.seed(2025)\n",
626
+ "np.random.seed(2025)"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "markdown",
631
+ "metadata": {
632
+ "id": "pY4yCoIuaQqp"
633
+ },
634
+ "source": [
635
+ "### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "code",
640
+ "execution_count": 12,
641
+ "metadata": {
642
+ "id": "mnd5hdAbaNjz"
643
+ },
644
+ "outputs": [],
645
+ "source": [
646
+ "def generate_popularity_score(rating):\n",
647
+ " base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n",
648
+ " trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n",
649
+ " return int(np.clip(base + trend_factor, 1, 5))"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {
655
+ "id": "n4-TaNTFgPak"
656
+ },
657
+ "source": [
658
+ "### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "execution_count": 13,
664
+ "metadata": {
665
+ "id": "V-G3OCUCgR07",
666
+ "colab": {
667
+ "base_uri": "https://localhost:8080/",
668
+ "height": 379
669
+ },
670
+ "outputId": "3835df83-5761-406c-95c4-7d8f3660e6a8"
671
+ },
672
+ "outputs": [
673
+ {
674
+ "output_type": "stream",
675
+ "name": "stdout",
676
+ "text": [
677
+ "DataFrame shape: (1000, 4)\n"
678
+ ]
679
+ },
680
+ {
681
+ "output_type": "display_data",
682
+ "data": {
683
+ "text/plain": [
684
+ " title price rating popularity_score\n",
685
+ "0 A Light in the Attic 51.77 Three 3\n",
686
+ "1 Tipping the Velvet 53.74 One 2\n",
687
+ "2 Soumission 50.10 One 2\n",
688
+ "3 Sharp Objects 47.82 Four 4\n",
689
+ "4 Sapiens: A Brief History of Humankind 54.23 Five 3"
690
+ ],
691
+ "text/html": [
692
+ "\n",
693
+ " <div id=\"df-7acfc331-fb80-4828-8da0-44200d7e5ba1\" class=\"colab-df-container\">\n",
694
+ " <div>\n",
695
+ "<style scoped>\n",
696
+ " .dataframe tbody tr th:only-of-type {\n",
697
+ " vertical-align: middle;\n",
698
+ " }\n",
699
+ "\n",
700
+ " .dataframe tbody tr th {\n",
701
+ " vertical-align: top;\n",
702
+ " }\n",
703
+ "\n",
704
+ " .dataframe thead th {\n",
705
+ " text-align: right;\n",
706
+ " }\n",
707
+ "</style>\n",
708
+ "<table border=\"1\" class=\"dataframe\">\n",
709
+ " <thead>\n",
710
+ " <tr style=\"text-align: right;\">\n",
711
+ " <th></th>\n",
712
+ " <th>title</th>\n",
713
+ " <th>price</th>\n",
714
+ " <th>rating</th>\n",
715
+ " <th>popularity_score</th>\n",
716
+ " </tr>\n",
717
+ " </thead>\n",
718
+ " <tbody>\n",
719
+ " <tr>\n",
720
+ " <th>0</th>\n",
721
+ " <td>A Light in the Attic</td>\n",
722
+ " <td>51.77</td>\n",
723
+ " <td>Three</td>\n",
724
+ " <td>3</td>\n",
725
+ " </tr>\n",
726
+ " <tr>\n",
727
+ " <th>1</th>\n",
728
+ " <td>Tipping the Velvet</td>\n",
729
+ " <td>53.74</td>\n",
730
+ " <td>One</td>\n",
731
+ " <td>2</td>\n",
732
+ " </tr>\n",
733
+ " <tr>\n",
734
+ " <th>2</th>\n",
735
+ " <td>Soumission</td>\n",
736
+ " <td>50.10</td>\n",
737
+ " <td>One</td>\n",
738
+ " <td>2</td>\n",
739
+ " </tr>\n",
740
+ " <tr>\n",
741
+ " <th>3</th>\n",
742
+ " <td>Sharp Objects</td>\n",
743
+ " <td>47.82</td>\n",
744
+ " <td>Four</td>\n",
745
+ " <td>4</td>\n",
746
+ " </tr>\n",
747
+ " <tr>\n",
748
+ " <th>4</th>\n",
749
+ " <td>Sapiens: A Brief History of Humankind</td>\n",
750
+ " <td>54.23</td>\n",
751
+ " <td>Five</td>\n",
752
+ " <td>3</td>\n",
753
+ " </tr>\n",
754
+ " </tbody>\n",
755
+ "</table>\n",
756
+ "</div>\n",
757
+ " <div class=\"colab-df-buttons\">\n",
758
+ "\n",
759
+ " <div class=\"colab-df-container\">\n",
760
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7acfc331-fb80-4828-8da0-44200d7e5ba1')\"\n",
761
+ " title=\"Convert this dataframe to an interactive table.\"\n",
762
+ " style=\"display:none;\">\n",
763
+ "\n",
764
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
765
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
766
+ " </svg>\n",
767
+ " </button>\n",
768
+ "\n",
769
+ " <style>\n",
770
+ " .colab-df-container {\n",
771
+ " display:flex;\n",
772
+ " gap: 12px;\n",
773
+ " }\n",
774
+ "\n",
775
+ " .colab-df-convert {\n",
776
+ " background-color: #E8F0FE;\n",
777
+ " border: none;\n",
778
+ " border-radius: 50%;\n",
779
+ " cursor: pointer;\n",
780
+ " display: none;\n",
781
+ " fill: #1967D2;\n",
782
+ " height: 32px;\n",
783
+ " padding: 0 0 0 0;\n",
784
+ " width: 32px;\n",
785
+ " }\n",
786
+ "\n",
787
+ " .colab-df-convert:hover {\n",
788
+ " background-color: #E2EBFA;\n",
789
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
790
+ " fill: #174EA6;\n",
791
+ " }\n",
792
+ "\n",
793
+ " .colab-df-buttons div {\n",
794
+ " margin-bottom: 4px;\n",
795
+ " }\n",
796
+ "\n",
797
+ " [theme=dark] .colab-df-convert {\n",
798
+ " background-color: #3B4455;\n",
799
+ " fill: #D2E3FC;\n",
800
+ " }\n",
801
+ "\n",
802
+ " [theme=dark] .colab-df-convert:hover {\n",
803
+ " background-color: #434B5C;\n",
804
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
805
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
806
+ " fill: #FFFFFF;\n",
807
+ " }\n",
808
+ " </style>\n",
809
+ "\n",
810
+ " <script>\n",
811
+ " const buttonEl =\n",
812
+ " document.querySelector('#df-7acfc331-fb80-4828-8da0-44200d7e5ba1 button.colab-df-convert');\n",
813
+ " buttonEl.style.display =\n",
814
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
815
+ "\n",
816
+ " async function convertToInteractive(key) {\n",
817
+ " const element = document.querySelector('#df-7acfc331-fb80-4828-8da0-44200d7e5ba1');\n",
818
+ " const dataTable =\n",
819
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
820
+ " [key], {});\n",
821
+ " if (!dataTable) return;\n",
822
+ "\n",
823
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
824
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
825
+ " + ' to learn more about interactive tables.';\n",
826
+ " element.innerHTML = '';\n",
827
+ " dataTable['output_type'] = 'display_data';\n",
828
+ " await google.colab.output.renderOutput(dataTable, element);\n",
829
+ " const docLink = document.createElement('div');\n",
830
+ " docLink.innerHTML = docLinkHtml;\n",
831
+ " element.appendChild(docLink);\n",
832
+ " }\n",
833
+ " </script>\n",
834
+ " </div>\n",
835
+ "\n",
836
+ "\n",
837
+ " </div>\n",
838
+ " </div>\n"
839
+ ],
840
+ "application/vnd.google.colaboratory.intrinsic+json": {
841
+ "type": "dataframe",
842
+ "summary": "{\n \"name\": \"print(df_books[\\\"popularity_score\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
843
+ }
844
+ },
845
+ "metadata": {}
846
+ },
847
+ {
848
+ "output_type": "stream",
849
+ "name": "stdout",
850
+ "text": [
851
+ "\n",
852
+ "Popularity Score Distribution:\n",
853
+ "popularity_score\n",
854
+ "1 38\n",
855
+ "2 197\n",
856
+ "3 327\n",
857
+ "4 321\n",
858
+ "5 117\n",
859
+ "Name: count, dtype: int64\n"
860
+ ]
861
+ }
862
+ ],
863
+ "source": [
864
+ "# =========================\n",
865
+ "# Create popularity_score column\n",
866
+ "# =========================\n",
867
+ "\n",
868
+ "# Apply the function to the rating column\n",
869
+ "df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)\n",
870
+ "\n",
871
+ "# Quick validation\n",
872
+ "print(\"DataFrame shape:\", df_books.shape)\n",
873
+ "\n",
874
+ "# Show first 5 rows\n",
875
+ "display(df_books.head())\n",
876
+ "\n",
877
+ "# Check distribution of the new variable\n",
878
+ "print(\"\\nPopularity Score Distribution:\")\n",
879
+ "print(df_books[\"popularity_score\"].value_counts().sort_index())"
880
+ ]
881
+ },
882
+ {
883
+ "cell_type": "markdown",
884
+ "metadata": {
885
+ "id": "HnngRNTgacYt"
886
+ },
887
+ "source": [
888
+ "### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "code",
893
+ "execution_count": 14,
894
+ "metadata": {
895
+ "id": "kUtWmr8maZLZ"
896
+ },
897
+ "outputs": [],
898
+ "source": [
899
+ "def get_sentiment(popularity_score):\n",
900
+ " if popularity_score <= 2:\n",
901
+ " return \"negative\"\n",
902
+ " elif popularity_score == 3:\n",
903
+ " return \"neutral\"\n",
904
+ " else:\n",
905
+ " return \"positive\""
906
+ ]
907
+ },
908
+ {
909
+ "cell_type": "markdown",
910
+ "metadata": {
911
+ "id": "HF9F9HIzgT7Z"
912
+ },
913
+ "source": [
914
+ "### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
915
+ ]
916
+ },
917
+ {
918
+ "cell_type": "code",
919
+ "execution_count": 15,
920
+ "metadata": {
921
+ "id": "tafQj8_7gYCG",
922
+ "colab": {
923
+ "base_uri": "https://localhost:8080/",
924
+ "height": 345
925
+ },
926
+ "outputId": "9bb6ed59-2bf6-4a41-8354-509515d182e7"
927
+ },
928
+ "outputs": [
929
+ {
930
+ "output_type": "stream",
931
+ "name": "stdout",
932
+ "text": [
933
+ "DataFrame shape: (1000, 5)\n"
934
+ ]
935
+ },
936
+ {
937
+ "output_type": "display_data",
938
+ "data": {
939
+ "text/plain": [
940
+ " title price rating popularity_score \\\n",
941
+ "0 A Light in the Attic 51.77 Three 3 \n",
942
+ "1 Tipping the Velvet 53.74 One 2 \n",
943
+ "2 Soumission 50.10 One 2 \n",
944
+ "3 Sharp Objects 47.82 Four 4 \n",
945
+ "4 Sapiens: A Brief History of Humankind 54.23 Five 3 \n",
946
+ "\n",
947
+ " sentiment_label \n",
948
+ "0 neutral \n",
949
+ "1 negative \n",
950
+ "2 negative \n",
951
+ "3 positive \n",
952
+ "4 neutral "
953
+ ],
954
+ "text/html": [
955
+ "\n",
956
+ " <div id=\"df-19236306-807e-4b6d-b783-330d529cdaed\" class=\"colab-df-container\">\n",
957
+ " <div>\n",
958
+ "<style scoped>\n",
959
+ " .dataframe tbody tr th:only-of-type {\n",
960
+ " vertical-align: middle;\n",
961
+ " }\n",
962
+ "\n",
963
+ " .dataframe tbody tr th {\n",
964
+ " vertical-align: top;\n",
965
+ " }\n",
966
+ "\n",
967
+ " .dataframe thead th {\n",
968
+ " text-align: right;\n",
969
+ " }\n",
970
+ "</style>\n",
971
+ "<table border=\"1\" class=\"dataframe\">\n",
972
+ " <thead>\n",
973
+ " <tr style=\"text-align: right;\">\n",
974
+ " <th></th>\n",
975
+ " <th>title</th>\n",
976
+ " <th>price</th>\n",
977
+ " <th>rating</th>\n",
978
+ " <th>popularity_score</th>\n",
979
+ " <th>sentiment_label</th>\n",
980
+ " </tr>\n",
981
+ " </thead>\n",
982
+ " <tbody>\n",
983
+ " <tr>\n",
984
+ " <th>0</th>\n",
985
+ " <td>A Light in the Attic</td>\n",
986
+ " <td>51.77</td>\n",
987
+ " <td>Three</td>\n",
988
+ " <td>3</td>\n",
989
+ " <td>neutral</td>\n",
990
+ " </tr>\n",
991
+ " <tr>\n",
992
+ " <th>1</th>\n",
993
+ " <td>Tipping the Velvet</td>\n",
994
+ " <td>53.74</td>\n",
995
+ " <td>One</td>\n",
996
+ " <td>2</td>\n",
997
+ " <td>negative</td>\n",
998
+ " </tr>\n",
999
+ " <tr>\n",
1000
+ " <th>2</th>\n",
1001
+ " <td>Soumission</td>\n",
1002
+ " <td>50.10</td>\n",
1003
+ " <td>One</td>\n",
1004
+ " <td>2</td>\n",
1005
+ " <td>negative</td>\n",
1006
+ " </tr>\n",
1007
+ " <tr>\n",
1008
+ " <th>3</th>\n",
1009
+ " <td>Sharp Objects</td>\n",
1010
+ " <td>47.82</td>\n",
1011
+ " <td>Four</td>\n",
1012
+ " <td>4</td>\n",
1013
+ " <td>positive</td>\n",
1014
+ " </tr>\n",
1015
+ " <tr>\n",
1016
+ " <th>4</th>\n",
1017
+ " <td>Sapiens: A Brief History of Humankind</td>\n",
1018
+ " <td>54.23</td>\n",
1019
+ " <td>Five</td>\n",
1020
+ " <td>3</td>\n",
1021
+ " <td>neutral</td>\n",
1022
+ " </tr>\n",
1023
+ " </tbody>\n",
1024
+ "</table>\n",
1025
+ "</div>\n",
1026
+ " <div class=\"colab-df-buttons\">\n",
1027
+ "\n",
1028
+ " <div class=\"colab-df-container\">\n",
1029
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-19236306-807e-4b6d-b783-330d529cdaed')\"\n",
1030
+ " title=\"Convert this dataframe to an interactive table.\"\n",
1031
+ " style=\"display:none;\">\n",
1032
+ "\n",
1033
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
1034
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
1035
+ " </svg>\n",
1036
+ " </button>\n",
1037
+ "\n",
1038
+ " <style>\n",
1039
+ " .colab-df-container {\n",
1040
+ " display:flex;\n",
1041
+ " gap: 12px;\n",
1042
+ " }\n",
1043
+ "\n",
1044
+ " .colab-df-convert {\n",
1045
+ " background-color: #E8F0FE;\n",
1046
+ " border: none;\n",
1047
+ " border-radius: 50%;\n",
1048
+ " cursor: pointer;\n",
1049
+ " display: none;\n",
1050
+ " fill: #1967D2;\n",
1051
+ " height: 32px;\n",
1052
+ " padding: 0 0 0 0;\n",
1053
+ " width: 32px;\n",
1054
+ " }\n",
1055
+ "\n",
1056
+ " .colab-df-convert:hover {\n",
1057
+ " background-color: #E2EBFA;\n",
1058
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1059
+ " fill: #174EA6;\n",
1060
+ " }\n",
1061
+ "\n",
1062
+ " .colab-df-buttons div {\n",
1063
+ " margin-bottom: 4px;\n",
1064
+ " }\n",
1065
+ "\n",
1066
+ " [theme=dark] .colab-df-convert {\n",
1067
+ " background-color: #3B4455;\n",
1068
+ " fill: #D2E3FC;\n",
1069
+ " }\n",
1070
+ "\n",
1071
+ " [theme=dark] .colab-df-convert:hover {\n",
1072
+ " background-color: #434B5C;\n",
1073
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
1074
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
1075
+ " fill: #FFFFFF;\n",
1076
+ " }\n",
1077
+ " </style>\n",
1078
+ "\n",
1079
+ " <script>\n",
1080
+ " const buttonEl =\n",
1081
+ " document.querySelector('#df-19236306-807e-4b6d-b783-330d529cdaed button.colab-df-convert');\n",
1082
+ " buttonEl.style.display =\n",
1083
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1084
+ "\n",
1085
+ " async function convertToInteractive(key) {\n",
1086
+ " const element = document.querySelector('#df-19236306-807e-4b6d-b783-330d529cdaed');\n",
1087
+ " const dataTable =\n",
1088
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
1089
+ " [key], {});\n",
1090
+ " if (!dataTable) return;\n",
1091
+ "\n",
1092
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
1093
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
1094
+ " + ' to learn more about interactive tables.';\n",
1095
+ " element.innerHTML = '';\n",
1096
+ " dataTable['output_type'] = 'display_data';\n",
1097
+ " await google.colab.output.renderOutput(dataTable, element);\n",
1098
+ " const docLink = document.createElement('div');\n",
1099
+ " docLink.innerHTML = docLinkHtml;\n",
1100
+ " element.appendChild(docLink);\n",
1101
+ " }\n",
1102
+ " </script>\n",
1103
+ " </div>\n",
1104
+ "\n",
1105
+ "\n",
1106
+ " </div>\n",
1107
+ " </div>\n"
1108
+ ],
1109
+ "application/vnd.google.colaboratory.intrinsic+json": {
1110
+ "type": "dataframe",
1111
+ "summary": "{\n \"name\": \"print(df_books[\\\"sentiment_label\\\"]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tipping the Velvet\",\n \"Sapiens: A Brief History of Humankind\",\n \"Soumission\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.647672562837028,\n \"min\": 47.82,\n \"max\": 54.23,\n \"num_unique_values\": 5,\n \"samples\": [\n 53.74,\n 54.23,\n 50.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"One\",\n \"Five\",\n \"Three\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 4,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1112
+ }
1113
+ },
1114
+ "metadata": {}
1115
+ },
1116
+ {
1117
+ "output_type": "stream",
1118
+ "name": "stdout",
1119
+ "text": [
1120
+ "\n",
1121
+ "Sentiment Distribution:\n",
1122
+ "sentiment_label\n",
1123
+ "positive 438\n",
1124
+ "neutral 327\n",
1125
+ "negative 235\n",
1126
+ "Name: count, dtype: int64\n"
1127
+ ]
1128
+ }
1129
+ ],
1130
+ "source": [
1131
+ "# =========================\n",
1132
+ "# Create sentiment_label column\n",
1133
+ "# =========================\n",
1134
+ "\n",
1135
+ "# Apply function to popularity_score\n",
1136
+ "df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)\n",
1137
+ "\n",
1138
+ "# Quick validation\n",
1139
+ "print(\"DataFrame shape:\", df_books.shape)\n",
1140
+ "\n",
1141
+ "# Show first 5 rows\n",
1142
+ "display(df_books.head())\n",
1143
+ "\n",
1144
+ "# Check sentiment distribution\n",
1145
+ "print(\"\\nSentiment Distribution:\")\n",
1146
+ "print(df_books[\"sentiment_label\"].value_counts())"
1147
+ ]
1148
+ },
1149
+ {
1150
+ "cell_type": "markdown",
1151
+ "metadata": {
1152
+ "id": "T8AdKkmASq9a"
1153
+ },
1154
+ "source": [
1155
+ "## **4.** 📈 Generate 18 months of synthetic book sales data"
1156
+ ]
1157
+ },
1158
+ {
1159
+ "cell_type": "markdown",
1160
+ "metadata": {
1161
+ "id": "OhXbdGD5fH0c"
1162
+ },
1163
+ "source": [
1164
+ "### *a. Create a generate_sales_profile function that would generate sales patterns based on sentiment_label (with some randomness)*"
1165
+ ]
1166
+ },
1167
+ {
1168
+ "cell_type": "code",
1169
+ "execution_count": 16,
1170
+ "metadata": {
1171
+ "id": "qkVhYPXGbgEn"
1172
+ },
1173
+ "outputs": [],
1174
+ "source": [
1175
+ "def generate_sales_profile(sentiment):\n",
1176
+ " months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n",
1177
+ "\n",
1178
+ " if sentiment == \"positive\":\n",
1179
+ " base = random.randint(200, 300)\n",
1180
+ " trend = np.linspace(base, base + random.randint(20, 60), len(months))\n",
1181
+ " elif sentiment == \"negative\":\n",
1182
+ " base = random.randint(20, 80)\n",
1183
+ " trend = np.linspace(base, base - random.randint(10, 30), len(months))\n",
1184
+ " else: # neutral\n",
1185
+ " base = random.randint(80, 160)\n",
1186
+ " trend = np.full(len(months), base + random.randint(-10, 10))\n",
1187
+ "\n",
1188
+ " seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n",
1189
+ " noise = np.random.normal(0, 5, len(months))\n",
1190
+ " monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
1191
+ "\n",
1192
+ " return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
1193
+ ]
1194
+ },
1195
+ {
1196
+ "cell_type": "markdown",
1197
+ "metadata": {
1198
+ "id": "L2ak1HlcgoTe"
1199
+ },
1200
+ "source": [
1201
+ "### *b. Run the function as part of building sales_data*"
1202
+ ]
1203
+ },
1204
+ {
1205
+ "cell_type": "code",
1206
+ "execution_count": 17,
1207
+ "metadata": {
1208
+ "id": "SlJ24AUafoDB"
1209
+ },
1210
+ "outputs": [],
1211
+ "source": [
1212
+ "sales_data = []\n",
1213
+ "for _, row in df_books.iterrows():\n",
1214
+ " records = generate_sales_profile(row[\"sentiment_label\"])\n",
1215
+ " for month, units in records:\n",
1216
+ " sales_data.append({\n",
1217
+ " \"title\": row[\"title\"],\n",
1218
+ " \"month\": month,\n",
1219
+ " \"units_sold\": units,\n",
1220
+ " \"sentiment_label\": row[\"sentiment_label\"]\n",
1221
+ " })"
1222
+ ]
1223
+ },
1224
+ {
1225
+ "cell_type": "markdown",
1226
+ "metadata": {
1227
+ "id": "4IXZKcCSgxnq"
1228
+ },
1229
+ "source": [
1230
+ "### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "cell_type": "code",
1235
+ "execution_count": 18,
1236
+ "metadata": {
1237
+ "id": "wcN6gtiZg-ws",
1238
+ "colab": {
1239
+ "base_uri": "https://localhost:8080/",
1240
+ "height": 640
1241
+ },
1242
+ "outputId": "edaf3e0c-0135-4218-a13a-24b0fdab4287"
1243
+ },
1244
+ "outputs": [
1245
+ {
1246
+ "output_type": "stream",
1247
+ "name": "stdout",
1248
+ "text": [
1249
+ "Shape of df_sales: (18000, 4)\n",
1250
+ "\n",
1251
+ "Columns:\n",
1252
+ "Index(['title', 'month', 'units_sold', 'sentiment_label'], dtype='object')\n",
1253
+ "\n",
1254
+ "Data types before cleaning:\n",
1255
+ "title object\n",
1256
+ "month object\n",
1257
+ "units_sold int64\n",
1258
+ "sentiment_label object\n",
1259
+ "dtype: object\n",
1260
+ "\n",
1261
+ "Data types after cleaning:\n",
1262
+ "title object\n",
1263
+ "month datetime64[ns]\n",
1264
+ "units_sold int64\n",
1265
+ "sentiment_label object\n",
1266
+ "dtype: object\n",
1267
+ "\n",
1268
+ "Missing values:\n",
1269
+ "title 0\n",
1270
+ "month 0\n",
1271
+ "units_sold 0\n",
1272
+ "sentiment_label 0\n",
1273
+ "dtype: int64\n"
1274
+ ]
1275
+ },
1276
+ {
1277
+ "output_type": "display_data",
1278
+ "data": {
1279
+ "text/plain": [
1280
+ " title month units_sold sentiment_label\n",
1281
+ "0 A Light in the Attic 2024-09-01 100 neutral\n",
1282
+ "1 A Light in the Attic 2024-10-01 109 neutral\n",
1283
+ "2 A Light in the Attic 2024-11-01 102 neutral\n",
1284
+ "3 A Light in the Attic 2024-12-01 107 neutral\n",
1285
+ "4 A Light in the Attic 2025-01-01 108 neutral"
1286
+ ],
1287
+ "text/html": [
1288
+ "\n",
1289
+ " <div id=\"df-fac6c718-35c9-4c22-9b98-b8ac475d3254\" class=\"colab-df-container\">\n",
1290
+ " <div>\n",
1291
+ "<style scoped>\n",
1292
+ " .dataframe tbody tr th:only-of-type {\n",
1293
+ " vertical-align: middle;\n",
1294
+ " }\n",
1295
+ "\n",
1296
+ " .dataframe tbody tr th {\n",
1297
+ " vertical-align: top;\n",
1298
+ " }\n",
1299
+ "\n",
1300
+ " .dataframe thead th {\n",
1301
+ " text-align: right;\n",
1302
+ " }\n",
1303
+ "</style>\n",
1304
+ "<table border=\"1\" class=\"dataframe\">\n",
1305
+ " <thead>\n",
1306
+ " <tr style=\"text-align: right;\">\n",
1307
+ " <th></th>\n",
1308
+ " <th>title</th>\n",
1309
+ " <th>month</th>\n",
1310
+ " <th>units_sold</th>\n",
1311
+ " <th>sentiment_label</th>\n",
1312
+ " </tr>\n",
1313
+ " </thead>\n",
1314
+ " <tbody>\n",
1315
+ " <tr>\n",
1316
+ " <th>0</th>\n",
1317
+ " <td>A Light in the Attic</td>\n",
1318
+ " <td>2024-09-01</td>\n",
1319
+ " <td>100</td>\n",
1320
+ " <td>neutral</td>\n",
1321
+ " </tr>\n",
1322
+ " <tr>\n",
1323
+ " <th>1</th>\n",
1324
+ " <td>A Light in the Attic</td>\n",
1325
+ " <td>2024-10-01</td>\n",
1326
+ " <td>109</td>\n",
1327
+ " <td>neutral</td>\n",
1328
+ " </tr>\n",
1329
+ " <tr>\n",
1330
+ " <th>2</th>\n",
1331
+ " <td>A Light in the Attic</td>\n",
1332
+ " <td>2024-11-01</td>\n",
1333
+ " <td>102</td>\n",
1334
+ " <td>neutral</td>\n",
1335
+ " </tr>\n",
1336
+ " <tr>\n",
1337
+ " <th>3</th>\n",
1338
+ " <td>A Light in the Attic</td>\n",
1339
+ " <td>2024-12-01</td>\n",
1340
+ " <td>107</td>\n",
1341
+ " <td>neutral</td>\n",
1342
+ " </tr>\n",
1343
+ " <tr>\n",
1344
+ " <th>4</th>\n",
1345
+ " <td>A Light in the Attic</td>\n",
1346
+ " <td>2025-01-01</td>\n",
1347
+ " <td>108</td>\n",
1348
+ " <td>neutral</td>\n",
1349
+ " </tr>\n",
1350
+ " </tbody>\n",
1351
+ "</table>\n",
1352
+ "</div>\n",
1353
+ " <div class=\"colab-df-buttons\">\n",
1354
+ "\n",
1355
+ " <div class=\"colab-df-container\">\n",
1356
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-fac6c718-35c9-4c22-9b98-b8ac475d3254')\"\n",
1357
+ " title=\"Convert this dataframe to an interactive table.\"\n",
1358
+ " style=\"display:none;\">\n",
1359
+ "\n",
1360
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
1361
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
1362
+ " </svg>\n",
1363
+ " </button>\n",
1364
+ "\n",
1365
+ " <style>\n",
1366
+ " .colab-df-container {\n",
1367
+ " display:flex;\n",
1368
+ " gap: 12px;\n",
1369
+ " }\n",
1370
+ "\n",
1371
+ " .colab-df-convert {\n",
1372
+ " background-color: #E8F0FE;\n",
1373
+ " border: none;\n",
1374
+ " border-radius: 50%;\n",
1375
+ " cursor: pointer;\n",
1376
+ " display: none;\n",
1377
+ " fill: #1967D2;\n",
1378
+ " height: 32px;\n",
1379
+ " padding: 0 0 0 0;\n",
1380
+ " width: 32px;\n",
1381
+ " }\n",
1382
+ "\n",
1383
+ " .colab-df-convert:hover {\n",
1384
+ " background-color: #E2EBFA;\n",
1385
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1386
+ " fill: #174EA6;\n",
1387
+ " }\n",
1388
+ "\n",
1389
+ " .colab-df-buttons div {\n",
1390
+ " margin-bottom: 4px;\n",
1391
+ " }\n",
1392
+ "\n",
1393
+ " [theme=dark] .colab-df-convert {\n",
1394
+ " background-color: #3B4455;\n",
1395
+ " fill: #D2E3FC;\n",
1396
+ " }\n",
1397
+ "\n",
1398
+ " [theme=dark] .colab-df-convert:hover {\n",
1399
+ " background-color: #434B5C;\n",
1400
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
1401
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
1402
+ " fill: #FFFFFF;\n",
1403
+ " }\n",
1404
+ " </style>\n",
1405
+ "\n",
1406
+ " <script>\n",
1407
+ " const buttonEl =\n",
1408
+ " document.querySelector('#df-fac6c718-35c9-4c22-9b98-b8ac475d3254 button.colab-df-convert');\n",
1409
+ " buttonEl.style.display =\n",
1410
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1411
+ "\n",
1412
+ " async function convertToInteractive(key) {\n",
1413
+ " const element = document.querySelector('#df-fac6c718-35c9-4c22-9b98-b8ac475d3254');\n",
1414
+ " const dataTable =\n",
1415
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
1416
+ " [key], {});\n",
1417
+ " if (!dataTable) return;\n",
1418
+ "\n",
1419
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
1420
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
1421
+ " + ' to learn more about interactive tables.';\n",
1422
+ " element.innerHTML = '';\n",
1423
+ " dataTable['output_type'] = 'display_data';\n",
1424
+ " await google.colab.output.renderOutput(dataTable, element);\n",
1425
+ " const docLink = document.createElement('div');\n",
1426
+ " docLink.innerHTML = docLinkHtml;\n",
1427
+ " element.appendChild(docLink);\n",
1428
+ " }\n",
1429
+ " </script>\n",
1430
+ " </div>\n",
1431
+ "\n",
1432
+ "\n",
1433
+ " </div>\n",
1434
+ " </div>\n"
1435
+ ],
1436
+ "application/vnd.google.colaboratory.intrinsic+json": {
1437
+ "type": "dataframe",
1438
+ "summary": "{\n \"name\": \"display(df_sales\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"A Light in the Attic\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2024-09-01 00:00:00\",\n \"max\": \"2025-01-01 00:00:00\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2024-10-01 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"units_sold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 100,\n \"max\": 109,\n \"num_unique_values\": 5,\n \"samples\": [\n 109\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"neutral\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1439
+ }
1440
+ },
1441
+ "metadata": {}
1442
+ }
1443
+ ],
1444
+ "source": [
1445
+ "# =========================\n",
1446
+ "# Create df_sales from sales_data\n",
1447
+ "# =========================\n",
1448
+ "\n",
1449
+ "# 1️⃣ Convert list of dictionaries into DataFrame\n",
1450
+ "df_sales = pd.DataFrame(sales_data)\n",
1451
+ "\n",
1452
+ "# 2️⃣ Basic validation\n",
1453
+ "print(\"Shape of df_sales:\", df_sales.shape)\n",
1454
+ "print(\"\\nColumns:\")\n",
1455
+ "print(df_sales.columns)\n",
1456
+ "\n",
1457
+ "print(\"\\nData types before cleaning:\")\n",
1458
+ "print(df_sales.dtypes)\n",
1459
+ "\n",
1460
+ "# 3️⃣ Ensure correct data types\n",
1461
+ "df_sales[\"month\"] = pd.to_datetime(df_sales[\"month\"], format=\"%Y-%m\")\n",
1462
+ "df_sales[\"units_sold\"] = pd.to_numeric(df_sales[\"units_sold\"], errors=\"coerce\")\n",
1463
+ "\n",
1464
+ "# 4️⃣ Final validation\n",
1465
+ "print(\"\\nData types after cleaning:\")\n",
1466
+ "print(df_sales.dtypes)\n",
1467
+ "\n",
1468
+ "print(\"\\nMissing values:\")\n",
1469
+ "print(df_sales.isna().sum())\n",
1470
+ "\n",
1471
+ "display(df_sales.head())"
1472
+ ]
1473
+ },
1474
+ {
1475
+ "cell_type": "markdown",
1476
+ "metadata": {
1477
+ "id": "EhIjz9WohAmZ"
1478
+ },
1479
+ "source": [
1480
+ "### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
1481
+ ]
1482
+ },
1483
+ {
1484
+ "cell_type": "code",
1485
+ "execution_count": 19,
1486
+ "metadata": {
1487
+ "colab": {
1488
+ "base_uri": "https://localhost:8080/"
1489
+ },
1490
+ "id": "MzbZvLcAhGaH",
1491
+ "outputId": "e5a2089f-49fb-4311-9e02-1e7204382cd5"
1492
+ },
1493
+ "outputs": [
1494
+ {
1495
+ "output_type": "stream",
1496
+ "name": "stdout",
1497
+ "text": [
1498
+ " title month units_sold sentiment_label\n",
1499
+ "0 A Light in the Attic 2024-09-01 100 neutral\n",
1500
+ "1 A Light in the Attic 2024-10-01 109 neutral\n",
1501
+ "2 A Light in the Attic 2024-11-01 102 neutral\n",
1502
+ "3 A Light in the Attic 2024-12-01 107 neutral\n",
1503
+ "4 A Light in the Attic 2025-01-01 108 neutral\n"
1504
+ ]
1505
+ }
1506
+ ],
1507
+ "source": [
1508
+ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
1509
+ "\n",
1510
+ "print(df_sales.head())"
1511
+ ]
1512
+ },
1513
+ {
1514
+ "cell_type": "markdown",
1515
+ "metadata": {
1516
+ "id": "7g9gqBgQMtJn"
1517
+ },
1518
+ "source": [
1519
+ "## **5.** 🎯 Generate synthetic customer reviews"
1520
+ ]
1521
+ },
1522
+ {
1523
+ "cell_type": "markdown",
1524
+ "metadata": {
1525
+ "id": "Gi4y9M9KuDWx"
1526
+ },
1527
+ "source": [
1528
+ "### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a dictionary called synthetic_reviews_by_sentiment that holds 50 distinct generic book review texts for each of the sentiment labels \"positive\", \"neutral\", and \"negative\"*"
1529
+ ]
1530
+ },
1531
+ {
1532
+ "cell_type": "code",
1533
+ "execution_count": 21,
1534
+ "metadata": {
1535
+ "id": "b3cd2a50"
1536
+ },
1537
+ "outputs": [],
1538
+ "source": [
1539
+ "# =========================\n",
1540
+ "# Synthetic Review Library\n",
1541
+ "# =========================\n",
1542
+ "\n",
1543
+ "synthetic_reviews_by_sentiment = {\n",
1544
+ " \"positive\": [\n",
1545
+ " \"Absolutely loved this book — it exceeded my expectations.\",\n",
1546
+ " \"A beautifully written story that kept me engaged throughout.\",\n",
1547
+ " \"The characters felt real and the journey was unforgettable.\",\n",
1548
+ " \"An inspiring and uplifting read.\",\n",
1549
+ " \"I couldn't put it down — truly captivating.\",\n",
1550
+ " \"A powerful narrative with emotional depth.\",\n",
1551
+ " \"One of the most enjoyable books I've read recently.\",\n",
1552
+ " \"Thought-provoking and wonderfully paced.\",\n",
1553
+ " \"An outstanding piece of storytelling.\",\n",
1554
+ " \"Rich in detail and full of heart.\",\n",
1555
+ " \"A masterfully crafted and compelling novel.\",\n",
1556
+ " \"The writing style was elegant and immersive.\",\n",
1557
+ " \"Highly recommended for anyone who loves great fiction.\",\n",
1558
+ " \"A deeply satisfying reading experience.\",\n",
1559
+ " \"It delivered everything I hoped for and more.\",\n",
1560
+ " \"An engaging plot with meaningful themes.\",\n",
1561
+ " \"Beautiful prose and a gripping storyline.\",\n",
1562
+ " \"A refreshing and memorable read.\",\n",
1563
+ " \"I was hooked from the first chapter.\",\n",
1564
+ " \"The emotional impact was incredible.\",\n",
1565
+ " \"A fantastic blend of drama and insight.\",\n",
1566
+ " \"Creative, smart, and thoroughly enjoyable.\",\n",
1567
+ " \"This book truly stands out.\",\n",
1568
+ " \"A rewarding and impactful story.\",\n",
1569
+ " \"An exceptional and moving narrative.\",\n",
1570
+ " \"I would gladly read this again.\",\n",
1571
+ " \"Strong characters and excellent pacing.\",\n",
1572
+ " \"It left a lasting impression on me.\",\n",
1573
+ " \"A brilliant and heartfelt story.\",\n",
1574
+ " \"Compelling from beginning to end.\",\n",
1575
+ " \"An imaginative and beautifully told tale.\",\n",
1576
+ " \"A story that resonates long after finishing.\",\n",
1577
+ " \"Thoroughly entertaining and meaningful.\",\n",
1578
+ " \"An absorbing and skillfully written book.\",\n",
1579
+ " \"The themes were handled with great care.\",\n",
1580
+ " \"An impressive and emotionally rich novel.\",\n",
1581
+ " \"The author did a fantastic job.\",\n",
1582
+ " \"A wonderful surprise and a joy to read.\",\n",
1583
+ " \"Truly inspiring and well-executed.\",\n",
1584
+ " \"An unforgettable reading experience.\",\n",
1585
+ " \"Deeply engaging and thoughtfully written.\",\n",
1586
+ " \"A delightful and captivating story.\",\n",
1587
+ " \"Everything about this book worked for me.\",\n",
1588
+ " \"A well-structured and compelling narrative.\",\n",
1589
+ " \"A standout title in its genre.\",\n",
1590
+ " \"An emotional rollercoaster in the best way.\",\n",
1591
+ " \"Expertly written and thoroughly enjoyable.\",\n",
1592
+ " \"The storytelling was simply outstanding.\",\n",
1593
+ " \"A gripping and meaningful journey.\",\n",
1594
+ " \"A beautifully developed and inspiring book.\"\n",
1595
+ " ],\n",
1596
+ "\n",
1597
+ " \"neutral\": [\n",
1598
+ " \"An average book — not particularly memorable.\",\n",
1599
+ " \"It had some strong moments but also some weak ones.\",\n",
1600
+ " \"A decent read overall.\",\n",
1601
+ " \"Neither impressive nor disappointing.\",\n",
1602
+ " \"Some chapters were engaging, others less so.\",\n",
1603
+ " \"It was okay, but nothing extraordinary.\",\n",
1604
+ " \"A fairly standard story.\",\n",
1605
+ " \"An acceptable way to spend an afternoon.\",\n",
1606
+ " \"The plot was predictable but readable.\",\n",
1607
+ " \"Not bad, but not outstanding either.\",\n",
1608
+ " \"A mixed reading experience.\",\n",
1609
+ " \"Some characters stood out, others faded.\",\n",
1610
+ " \"The pacing was inconsistent at times.\",\n",
1611
+ " \"It held my attention occasionally.\",\n",
1612
+ " \"An ordinary but readable novel.\",\n",
1613
+ " \"It had potential but didn't fully deliver.\",\n",
1614
+ " \"The writing was competent but not remarkable.\",\n",
1615
+ " \"An average effort overall.\",\n",
1616
+ " \"Interesting in parts, slow in others.\",\n",
1617
+ " \"A moderately enjoyable read.\",\n",
1618
+ " \"I neither loved nor disliked it.\",\n",
1619
+ " \"The themes were somewhat engaging.\",\n",
1620
+ " \"A reasonable but forgettable book.\",\n",
1621
+ " \"Not as strong as I expected.\",\n",
1622
+ " \"It was fine, just not memorable.\",\n",
1623
+ " \"Some elements worked better than others.\",\n",
1624
+ " \"An uneven but passable story.\",\n",
1625
+ " \"The concept was interesting, execution average.\",\n",
1626
+ " \"It didn’t fully captivate me.\",\n",
1627
+ " \"A fair attempt with mixed results.\",\n",
1628
+ " \"Serviceable but not standout.\",\n",
1629
+ " \"A readable yet unremarkable book.\",\n",
1630
+ " \"There were moments of interest.\",\n",
1631
+ " \"It felt somewhat conventional.\",\n",
1632
+ " \"An okay read with minor highlights.\",\n",
1633
+ " \"The storyline was acceptable.\",\n",
1634
+ " \"It met basic expectations.\",\n",
1635
+ " \"A safe and predictable narrative.\",\n",
1636
+ " \"Nothing particularly new or exciting.\",\n",
1637
+ " \"A book I won’t revisit but don’t regret.\",\n",
1638
+ " \"Some parts were enjoyable.\",\n",
1639
+ " \"A mildly engaging experience.\",\n",
1640
+ " \"It had both strengths and weaknesses.\",\n",
1641
+ " \"An overall average performance.\",\n",
1642
+ " \"The writing was simple and straightforward.\",\n",
1643
+ " \"A balanced but unremarkable read.\",\n",
1644
+ " \"It delivered a standard storyline.\",\n",
1645
+ " \"Somewhat entertaining but not gripping.\",\n",
1646
+ " \"It was adequate for its genre.\",\n",
1647
+ " \"A middle-of-the-road book.\"\n",
1648
+ " ],\n",
1649
+ "\n",
1650
+ " \"negative\": [\n",
1651
+ " \"I struggled to stay engaged throughout.\",\n",
1652
+ " \"The story failed to capture my interest.\",\n",
1653
+ " \"Disappointing from start to finish.\",\n",
1654
+ " \"The characters felt flat and unconvincing.\",\n",
1655
+ " \"It didn’t live up to the hype.\",\n",
1656
+ " \"The pacing was painfully slow.\",\n",
1657
+ " \"I found the plot confusing.\",\n",
1658
+ " \"The writing style didn’t appeal to me.\",\n",
1659
+ " \"A frustrating reading experience.\",\n",
1660
+ " \"The story lacked direction.\",\n",
1661
+ " \"I expected much more from this book.\",\n",
1662
+ " \"It was difficult to finish.\",\n",
1663
+ " \"The narrative felt disjointed.\",\n",
1664
+ " \"The themes weren’t well developed.\",\n",
1665
+ " \"The dialogue seemed unrealistic.\",\n",
1666
+ " \"I couldn’t connect with the characters.\",\n",
1667
+ " \"The storyline felt repetitive.\",\n",
1668
+ " \"It left me underwhelmed.\",\n",
1669
+ " \"The book lacked emotional impact.\",\n",
1670
+ " \"Not as compelling as I had hoped.\",\n",
1671
+ " \"The ending was unsatisfying.\",\n",
1672
+ " \"It felt rushed and incomplete.\",\n",
1673
+ " \"The plot had too many gaps.\",\n",
1674
+ " \"I lost interest halfway through.\",\n",
1675
+ " \"The execution was disappointing.\",\n",
1676
+ " \"The concept was better than the delivery.\",\n",
1677
+ " \"It didn’t hold my attention.\",\n",
1678
+ " \"A forgettable and dull read.\",\n",
1679
+ " \"The structure felt messy.\",\n",
1680
+ " \"It failed to leave a lasting impression.\",\n",
1681
+ " \"The writing felt uninspired.\",\n",
1682
+ " \"The story lacked depth.\",\n",
1683
+ " \"I found it quite tedious.\",\n",
1684
+ " \"The pacing was uneven and slow.\",\n",
1685
+ " \"It lacked originality.\",\n",
1686
+ " \"A missed opportunity.\",\n",
1687
+ " \"The characters weren’t believable.\",\n",
1688
+ " \"It didn’t resonate with me.\",\n",
1689
+ " \"The development was weak.\",\n",
1690
+ " \"The plot twists felt forced.\",\n",
1691
+ " \"I wouldn’t recommend it.\",\n",
1692
+ " \"It didn’t meet my expectations.\",\n",
1693
+ " \"The storytelling was underwhelming.\",\n",
1694
+ " \"The book felt overly long.\",\n",
1695
+ " \"The tone felt inconsistent.\",\n",
1696
+ " \"It was hard to stay invested.\",\n",
1697
+ " \"The narrative felt shallow.\",\n",
1698
+ " \"Not an enjoyable experience.\",\n",
1699
+ " \"It lacked clarity and focus.\",\n",
1700
+ " \"Overall, a disappointing read.\"\n",
1701
+ " ]\n",
1702
+ "}"
1703
+ ]
1704
+ },
1705
+ {
1706
+ "cell_type": "markdown",
1707
+ "metadata": {
1708
+ "id": "fQhfVaDmuULT"
1709
+ },
1710
+ "source": [
1711
+ "### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
1712
+ ]
1713
+ },
1714
+ {
1715
+ "cell_type": "code",
1716
+ "execution_count": 22,
1717
+ "metadata": {
1718
+ "id": "l2SRc3PjuTGM"
1719
+ },
1720
+ "outputs": [],
1721
+ "source": [
1722
+ "review_rows = []\n",
1723
+ "for _, row in df_books.iterrows():\n",
1724
+ " title = row['title']\n",
1725
+ " sentiment_label = row['sentiment_label']\n",
1726
+ " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
1727
+ " sampled_reviews = random.sample(review_pool, 10)\n",
1728
+ " for review_text in sampled_reviews:\n",
1729
+ " review_rows.append({\n",
1730
+ " \"title\": title,\n",
1731
+ " \"sentiment_label\": sentiment_label,\n",
1732
+ " \"review_text\": review_text,\n",
1733
+ " \"rating\": row['rating'],\n",
1734
+ " \"popularity_score\": row['popularity_score']\n",
1735
+ " })"
1736
+ ]
1737
+ },
1738
+ {
1739
+ "cell_type": "markdown",
1740
+ "metadata": {
1741
+ "id": "bmJMXF-Bukdm"
1742
+ },
1743
+ "source": [
1744
+ "### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
1745
+ ]
1746
+ },
1747
+ {
1748
+ "cell_type": "code",
1749
+ "execution_count": 24,
1750
+ "metadata": {
1751
+ "id": "ZUKUqZsuumsp"
1752
+ },
1753
+ "outputs": [],
1754
+ "source": [
1755
+ "df_reviews = pd.DataFrame(review_rows)\n",
1756
+ "df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)"
1757
+ ]
1758
+ },
1759
+ {
1760
+ "cell_type": "markdown",
1761
+ "source": [
1762
+ "### *c. inputs for R*"
1763
+ ],
1764
+ "metadata": {
1765
+ "id": "_602pYUS3gY5"
1766
+ }
1767
+ },
1768
+ {
1769
+ "cell_type": "markdown",
1770
+ "metadata": {
1771
+ "id": "RYvGyVfXuo54"
1772
+ },
1773
+ "source": [
1774
+ "### *d. ✋🏻🛑⛔️ View the first few lines*"
1775
+ ]
1776
+ },
1777
+ {
1778
+ "cell_type": "code",
1779
+ "execution_count": 25,
1780
+ "metadata": {
1781
+ "colab": {
1782
+ "base_uri": "https://localhost:8080/",
1783
+ "height": 206
1784
+ },
1785
+ "id": "xfE8NMqOurKo",
1786
+ "outputId": "29dcaaf0-5a04-4ee0-e2cf-2fb743b40f35"
1787
+ },
1788
+ "outputs": [
1789
+ {
1790
+ "output_type": "execute_result",
1791
+ "data": {
1792
+ "text/plain": [
1793
+ " title sentiment_label \\\n",
1794
+ "0 A Light in the Attic neutral \n",
1795
+ "1 A Light in the Attic neutral \n",
1796
+ "2 A Light in the Attic neutral \n",
1797
+ "3 A Light in the Attic neutral \n",
1798
+ "4 A Light in the Attic neutral \n",
1799
+ "\n",
1800
+ " review_text rating popularity_score \n",
1801
+ "0 It delivered a standard storyline. Three 3 \n",
1802
+ "1 A reasonable but forgettable book. Three 3 \n",
1803
+ "2 An okay read with minor highlights. Three 3 \n",
1804
+ "3 The plot was predictable but readable. Three 3 \n",
1805
+ "4 The writing was simple and straightforward. Three 3 "
1806
+ ],
1807
+ "text/html": [
1808
+ "\n",
1809
+ " <div id=\"df-6306fd8e-6e2d-43bb-a5cf-ec02998d4568\" class=\"colab-df-container\">\n",
1810
+ " <div>\n",
1811
+ "<style scoped>\n",
1812
+ " .dataframe tbody tr th:only-of-type {\n",
1813
+ " vertical-align: middle;\n",
1814
+ " }\n",
1815
+ "\n",
1816
+ " .dataframe tbody tr th {\n",
1817
+ " vertical-align: top;\n",
1818
+ " }\n",
1819
+ "\n",
1820
+ " .dataframe thead th {\n",
1821
+ " text-align: right;\n",
1822
+ " }\n",
1823
+ "</style>\n",
1824
+ "<table border=\"1\" class=\"dataframe\">\n",
1825
+ " <thead>\n",
1826
+ " <tr style=\"text-align: right;\">\n",
1827
+ " <th></th>\n",
1828
+ " <th>title</th>\n",
1829
+ " <th>sentiment_label</th>\n",
1830
+ " <th>review_text</th>\n",
1831
+ " <th>rating</th>\n",
1832
+ " <th>popularity_score</th>\n",
1833
+ " </tr>\n",
1834
+ " </thead>\n",
1835
+ " <tbody>\n",
1836
+ " <tr>\n",
1837
+ " <th>0</th>\n",
1838
+ " <td>A Light in the Attic</td>\n",
1839
+ " <td>neutral</td>\n",
1840
+ " <td>It delivered a standard storyline.</td>\n",
1841
+ " <td>Three</td>\n",
1842
+ " <td>3</td>\n",
1843
+ " </tr>\n",
1844
+ " <tr>\n",
1845
+ " <th>1</th>\n",
1846
+ " <td>A Light in the Attic</td>\n",
1847
+ " <td>neutral</td>\n",
1848
+ " <td>A reasonable but forgettable book.</td>\n",
1849
+ " <td>Three</td>\n",
1850
+ " <td>3</td>\n",
1851
+ " </tr>\n",
1852
+ " <tr>\n",
1853
+ " <th>2</th>\n",
1854
+ " <td>A Light in the Attic</td>\n",
1855
+ " <td>neutral</td>\n",
1856
+ " <td>An okay read with minor highlights.</td>\n",
1857
+ " <td>Three</td>\n",
1858
+ " <td>3</td>\n",
1859
+ " </tr>\n",
1860
+ " <tr>\n",
1861
+ " <th>3</th>\n",
1862
+ " <td>A Light in the Attic</td>\n",
1863
+ " <td>neutral</td>\n",
1864
+ " <td>The plot was predictable but readable.</td>\n",
1865
+ " <td>Three</td>\n",
1866
+ " <td>3</td>\n",
1867
+ " </tr>\n",
1868
+ " <tr>\n",
1869
+ " <th>4</th>\n",
1870
+ " <td>A Light in the Attic</td>\n",
1871
+ " <td>neutral</td>\n",
1872
+ " <td>The writing was simple and straightforward.</td>\n",
1873
+ " <td>Three</td>\n",
1874
+ " <td>3</td>\n",
1875
+ " </tr>\n",
1876
+ " </tbody>\n",
1877
+ "</table>\n",
1878
+ "</div>\n",
1879
+ " <div class=\"colab-df-buttons\">\n",
1880
+ "\n",
1881
+ " <div class=\"colab-df-container\">\n",
1882
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-6306fd8e-6e2d-43bb-a5cf-ec02998d4568')\"\n",
1883
+ " title=\"Convert this dataframe to an interactive table.\"\n",
1884
+ " style=\"display:none;\">\n",
1885
+ "\n",
1886
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
1887
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
1888
+ " </svg>\n",
1889
+ " </button>\n",
1890
+ "\n",
1891
+ " <style>\n",
1892
+ " .colab-df-container {\n",
1893
+ " display:flex;\n",
1894
+ " gap: 12px;\n",
1895
+ " }\n",
1896
+ "\n",
1897
+ " .colab-df-convert {\n",
1898
+ " background-color: #E8F0FE;\n",
1899
+ " border: none;\n",
1900
+ " border-radius: 50%;\n",
1901
+ " cursor: pointer;\n",
1902
+ " display: none;\n",
1903
+ " fill: #1967D2;\n",
1904
+ " height: 32px;\n",
1905
+ " padding: 0 0 0 0;\n",
1906
+ " width: 32px;\n",
1907
+ " }\n",
1908
+ "\n",
1909
+ " .colab-df-convert:hover {\n",
1910
+ " background-color: #E2EBFA;\n",
1911
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1912
+ " fill: #174EA6;\n",
1913
+ " }\n",
1914
+ "\n",
1915
+ " .colab-df-buttons div {\n",
1916
+ " margin-bottom: 4px;\n",
1917
+ " }\n",
1918
+ "\n",
1919
+ " [theme=dark] .colab-df-convert {\n",
1920
+ " background-color: #3B4455;\n",
1921
+ " fill: #D2E3FC;\n",
1922
+ " }\n",
1923
+ "\n",
1924
+ " [theme=dark] .colab-df-convert:hover {\n",
1925
+ " background-color: #434B5C;\n",
1926
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
1927
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
1928
+ " fill: #FFFFFF;\n",
1929
+ " }\n",
1930
+ " </style>\n",
1931
+ "\n",
1932
+ " <script>\n",
1933
+ " const buttonEl =\n",
1934
+ " document.querySelector('#df-6306fd8e-6e2d-43bb-a5cf-ec02998d4568 button.colab-df-convert');\n",
1935
+ " buttonEl.style.display =\n",
1936
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1937
+ "\n",
1938
+ " async function convertToInteractive(key) {\n",
1939
+ " const element = document.querySelector('#df-6306fd8e-6e2d-43bb-a5cf-ec02998d4568');\n",
1940
+ " const dataTable =\n",
1941
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
1942
+ " [key], {});\n",
1943
+ " if (!dataTable) return;\n",
1944
+ "\n",
1945
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
1946
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
1947
+ " + ' to learn more about interactive tables.';\n",
1948
+ " element.innerHTML = '';\n",
1949
+ " dataTable['output_type'] = 'display_data';\n",
1950
+ " await google.colab.output.renderOutput(dataTable, element);\n",
1951
+ " const docLink = document.createElement('div');\n",
1952
+ " docLink.innerHTML = docLinkHtml;\n",
1953
+ " element.appendChild(docLink);\n",
1954
+ " }\n",
1955
+ " </script>\n",
1956
+ " </div>\n",
1957
+ "\n",
1958
+ "\n",
1959
+ " </div>\n",
1960
+ " </div>\n"
1961
+ ],
1962
+ "application/vnd.google.colaboratory.intrinsic+json": {
1963
+ "type": "dataframe",
1964
+ "variable_name": "df_reviews",
1965
+ "summary": "{\n \"name\": \"df_reviews\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment_label\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"neutral\",\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"review_text\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 150,\n \"samples\": [\n \"A fantastic blend of drama and insight.\",\n \"The dialogue seemed unrealistic.\",\n \"An imaginative and beautifully told tale.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1966
+ }
1967
+ },
1968
+ "metadata": {},
1969
+ "execution_count": 25
1970
+ }
1971
+ ],
1972
+ "source": [
1973
+ "df_reviews.head()"
1974
+ ]
1975
+ }
1976
+ ],
1977
+ "metadata": {
1978
+ "colab": {
1979
+ "collapsed_sections": [
1980
+ "jpASMyIQMaAq",
1981
+ "lquNYCbfL9IM",
1982
+ "0IWuNpxxYDJF",
1983
+ "oCdTsin2Yfp3",
1984
+ "T0TOeRC4Yrnn",
1985
+ "duI5dv3CZYvF",
1986
+ "qMjRKMBQZlJi",
1987
+ "p-1Pr2szaqLk",
1988
+ "SIaJUGIpaH4V",
1989
+ "pY4yCoIuaQqp",
1990
+ "n4-TaNTFgPak",
1991
+ "HnngRNTgacYt",
1992
+ "HF9F9HIzgT7Z",
1993
+ "T8AdKkmASq9a",
1994
+ "OhXbdGD5fH0c",
1995
+ "L2ak1HlcgoTe",
1996
+ "4IXZKcCSgxnq",
1997
+ "EhIjz9WohAmZ",
1998
+ "Gi4y9M9KuDWx",
1999
+ "fQhfVaDmuULT",
2000
+ "bmJMXF-Bukdm",
2001
+ "RYvGyVfXuo54"
2002
+ ],
2003
+ "provenance": []
2004
+ },
2005
+ "kernelspec": {
2006
+ "display_name": "Python 3",
2007
+ "name": "python3"
2008
+ },
2009
+ "language_info": {
2010
+ "name": "python"
2011
+ }
2012
+ },
2013
+ "nbformat": 4,
2014
+ "nbformat_minor": 0
2015
+ }
2a_Python_Analysis_(1) (1).ipynb ADDED
The diff for this file is too large to render. See raw diff