Spaces:
Sleeping
Sleeping
Commit ·
9ac3023
1
Parent(s): f9169b8
update
Browse files
notebooks/02_eda.ipynb
CHANGED
|
@@ -180,6 +180,26 @@
|
|
| 180 |
"sample_train.info()"
|
| 181 |
]
|
| 182 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
{
|
| 184 |
"cell_type": "markdown",
|
| 185 |
"id": "f082d532",
|
|
@@ -300,7 +320,17 @@
|
|
| 300 |
},
|
| 301 |
{
|
| 302 |
"cell_type": "code",
|
| 303 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
"id": "2da64228",
|
| 305 |
"metadata": {},
|
| 306 |
"outputs": [
|
|
@@ -316,7 +346,18 @@
|
|
| 316 |
}
|
| 317 |
],
|
| 318 |
"source": [
|
| 319 |
-
"sns.countplot(x='review_target', data=sample_train)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
"plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
|
| 321 |
"plt.xlabel('Target Class')\n",
|
| 322 |
"plt.ylabel('Count')\n",
|
|
@@ -333,7 +374,7 @@
|
|
| 333 |
},
|
| 334 |
{
|
| 335 |
"cell_type": "code",
|
| 336 |
-
"execution_count":
|
| 337 |
"id": "aaa59508",
|
| 338 |
"metadata": {},
|
| 339 |
"outputs": [
|
|
|
|
| 180 |
"sample_train.info()"
|
| 181 |
]
|
| 182 |
},
|
| 183 |
+
{
|
| 184 |
+
"cell_type": "code",
|
| 185 |
+
"execution_count": null,
|
| 186 |
+
"id": "dedcfbf6",
|
| 187 |
+
"metadata": {},
|
| 188 |
+
"outputs": [],
|
| 189 |
+
"source": [
|
| 190 |
+
"print('There are {} rows and {} columns in train'.format(sample_train.shape[0], sample_train.shape[1]))"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"cell_type": "code",
|
| 195 |
+
"execution_count": null,
|
| 196 |
+
"id": "c6e524fd",
|
| 197 |
+
"metadata": {},
|
| 198 |
+
"outputs": [],
|
| 199 |
+
"source": [
|
| 200 |
+
"sample_train.describe()"
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
{
|
| 204 |
"cell_type": "markdown",
|
| 205 |
"id": "f082d532",
|
|
|
|
| 320 |
},
|
| 321 |
{
|
| 322 |
"cell_type": "code",
|
| 323 |
+
"execution_count": null,
|
| 324 |
+
"id": "408d9237",
|
| 325 |
+
"metadata": {},
|
| 326 |
+
"outputs": [],
|
| 327 |
+
"source": [
|
| 328 |
+
"sample_train.groupby('review_target').describe()\n"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"cell_type": "code",
|
| 333 |
+
"execution_count": null,
|
| 334 |
"id": "2da64228",
|
| 335 |
"metadata": {},
|
| 336 |
"outputs": [
|
|
|
|
| 346 |
}
|
| 347 |
],
|
| 348 |
"source": [
|
| 349 |
+
"ax= sns.countplot(x='review_target', data=sample_train)\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"for p in ax.patches: # bars\n",
|
| 352 |
+
" '''\n",
|
| 353 |
+
" get_bbox(): return bounding box of the bar, \n",
|
| 354 |
+
" get_points(): returns the coordinates of the four corners of the bounding box.\n",
|
| 355 |
+
" '''\n",
|
| 356 |
+
" x= p.get_bbox().get_points()[:,0] # extract the x-coordinates of the four corners of the bar rectangle\n",
|
| 357 |
+
" y= p.get_bbox().get_points()[1,1] # extract the y-coordinate of the top-right corner\n",
|
| 358 |
+
" ax.annotate(f'{y:.0f}', (x.mean(), y), ha='center',va='bottom') # text on top bar\n",
|
| 359 |
+
" \n",
|
| 360 |
+
"\n",
|
| 361 |
"plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
|
| 362 |
"plt.xlabel('Target Class')\n",
|
| 363 |
"plt.ylabel('Count')\n",
|
|
|
|
| 374 |
},
|
| 375 |
{
|
| 376 |
"cell_type": "code",
|
| 377 |
+
"execution_count": null,
|
| 378 |
"id": "aaa59508",
|
| 379 |
"metadata": {},
|
| 380 |
"outputs": [
|
notebooks/03_data_preprocessing.ipynb
CHANGED
|
@@ -160,7 +160,7 @@
|
|
| 160 |
}
|
| 161 |
],
|
| 162 |
"source": [
|
| 163 |
-
"balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0
|
| 164 |
"balanced_sample_train.head()"
|
| 165 |
]
|
| 166 |
},
|
|
@@ -174,17 +174,17 @@
|
|
| 174 |
"output_type": "stream",
|
| 175 |
"text": [
|
| 176 |
"<class 'pandas.DataFrame'>\n",
|
| 177 |
-
"RangeIndex:
|
| 178 |
"Data columns (total 5 columns):\n",
|
| 179 |
" # Column Non-Null Count Dtype\n",
|
| 180 |
"--- ------ -------------- -----\n",
|
| 181 |
-
" 0 review_target
|
| 182 |
-
" 1 review_title
|
| 183 |
-
" 2 review_content
|
| 184 |
-
" 3 char_count
|
| 185 |
-
" 4 word_count
|
| 186 |
"dtypes: str(5)\n",
|
| 187 |
-
"memory usage:
|
| 188 |
]
|
| 189 |
}
|
| 190 |
],
|
|
@@ -202,7 +202,7 @@
|
|
| 202 |
},
|
| 203 |
{
|
| 204 |
"cell_type": "code",
|
| 205 |
-
"execution_count":
|
| 206 |
"id": "2deb74f4",
|
| 207 |
"metadata": {},
|
| 208 |
"outputs": [
|
|
@@ -308,7 +308,7 @@
|
|
| 308 |
"4 love sheet sleek smooth really cool feel perfe... "
|
| 309 |
]
|
| 310 |
},
|
| 311 |
-
"execution_count":
|
| 312 |
"metadata": {},
|
| 313 |
"output_type": "execute_result"
|
| 314 |
}
|
|
@@ -329,7 +329,7 @@
|
|
| 329 |
},
|
| 330 |
{
|
| 331 |
"cell_type": "code",
|
| 332 |
-
"execution_count":
|
| 333 |
"id": "2c4e029b",
|
| 334 |
"metadata": {},
|
| 335 |
"outputs": [
|
|
@@ -346,7 +346,7 @@
|
|
| 346 |
"{'csv': PosixPath('data/processed/processed_train.csv')}"
|
| 347 |
]
|
| 348 |
},
|
| 349 |
-
"execution_count":
|
| 350 |
"metadata": {},
|
| 351 |
"output_type": "execute_result"
|
| 352 |
}
|
|
|
|
| 160 |
}
|
| 161 |
],
|
| 162 |
"source": [
|
| 163 |
+
"balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0)\n",
|
| 164 |
"balanced_sample_train.head()"
|
| 165 |
]
|
| 166 |
},
|
|
|
|
| 174 |
"output_type": "stream",
|
| 175 |
"text": [
|
| 176 |
"<class 'pandas.DataFrame'>\n",
|
| 177 |
+
"RangeIndex: 79972 entries, 0 to 79971\n",
|
| 178 |
"Data columns (total 5 columns):\n",
|
| 179 |
" # Column Non-Null Count Dtype\n",
|
| 180 |
"--- ------ -------------- -----\n",
|
| 181 |
+
" 0 review_target 79972 non-null str \n",
|
| 182 |
+
" 1 review_title 79972 non-null str \n",
|
| 183 |
+
" 2 review_content 79972 non-null str \n",
|
| 184 |
+
" 3 char_count 79972 non-null str \n",
|
| 185 |
+
" 4 word_count 79972 non-null str \n",
|
| 186 |
"dtypes: str(5)\n",
|
| 187 |
+
"memory usage: 3.1 MB\n"
|
| 188 |
]
|
| 189 |
}
|
| 190 |
],
|
|
|
|
| 202 |
},
|
| 203 |
{
|
| 204 |
"cell_type": "code",
|
| 205 |
+
"execution_count": 5,
|
| 206 |
"id": "2deb74f4",
|
| 207 |
"metadata": {},
|
| 208 |
"outputs": [
|
|
|
|
| 308 |
"4 love sheet sleek smooth really cool feel perfe... "
|
| 309 |
]
|
| 310 |
},
|
| 311 |
+
"execution_count": 5,
|
| 312 |
"metadata": {},
|
| 313 |
"output_type": "execute_result"
|
| 314 |
}
|
|
|
|
| 329 |
},
|
| 330 |
{
|
| 331 |
"cell_type": "code",
|
| 332 |
+
"execution_count": 6,
|
| 333 |
"id": "2c4e029b",
|
| 334 |
"metadata": {},
|
| 335 |
"outputs": [
|
|
|
|
| 346 |
"{'csv': PosixPath('data/processed/processed_train.csv')}"
|
| 347 |
]
|
| 348 |
},
|
| 349 |
+
"execution_count": 6,
|
| 350 |
"metadata": {},
|
| 351 |
"output_type": "execute_result"
|
| 352 |
}
|
notebooks/04_feature_engineering.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|