elsayedelmandoh committed on
Commit
9ac3023
·
1 Parent(s): f9169b8
notebooks/02_eda.ipynb CHANGED
@@ -180,6 +180,26 @@
180
  "sample_train.info()"
181
  ]
182
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  {
184
  "cell_type": "markdown",
185
  "id": "f082d532",
@@ -300,7 +320,17 @@
300
  },
301
  {
302
  "cell_type": "code",
303
- "execution_count": 9,
 
 
 
 
 
 
 
 
 
 
304
  "id": "2da64228",
305
  "metadata": {},
306
  "outputs": [
@@ -316,7 +346,18 @@
316
  }
317
  ],
318
  "source": [
319
- "sns.countplot(x='review_target', data=sample_train)\n",
 
 
 
 
 
 
 
 
 
 
 
320
  "plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
321
  "plt.xlabel('Target Class')\n",
322
  "plt.ylabel('Count')\n",
@@ -333,7 +374,7 @@
333
  },
334
  {
335
  "cell_type": "code",
336
- "execution_count": 10,
337
  "id": "aaa59508",
338
  "metadata": {},
339
  "outputs": [
 
180
  "sample_train.info()"
181
  ]
182
  },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "id": "dedcfbf6",
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "print('There are {} rows and {} columns in train'.format(sample_train.shape[0], sample_train.shape[1]))"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "id": "c6e524fd",
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "sample_train.describe()"
201
+ ]
202
+ },
203
  {
204
  "cell_type": "markdown",
205
  "id": "f082d532",
 
320
  },
321
  {
322
  "cell_type": "code",
323
+ "execution_count": null,
324
+ "id": "408d9237",
325
+ "metadata": {},
326
+ "outputs": [],
327
+ "source": [
328
+ "sample_train.groupby('review_target').describe()\n"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": null,
334
  "id": "2da64228",
335
  "metadata": {},
336
  "outputs": [
 
346
  }
347
  ],
348
  "source": [
349
+ "ax= sns.countplot(x='review_target', data=sample_train)\n",
350
+ "\n",
351
+ "for p in ax.patches: # bars\n",
352
+ " '''\n",
353
+ " get_bbox(): return bounding box of the bar, \n",
354
+ " get_points(): returns the bounding box as a 2x2 array [[x0, y0], [x1, y1]] (its two extreme corner points).\n",
355
+ " '''\n",
356
+ " x= p.get_bbox().get_points()[:,0] # x-coordinates of the left and right edges of the bar (x0, x1)\n",
357
+ " y= p.get_bbox().get_points()[1,1] # extract the y-coordinate of the top-right corner\n",
358
+ " ax.annotate(f'{y:.0f}', (x.mean(), y), ha='center',va='bottom') # text on top bar\n",
359
+ " \n",
360
+ "\n",
361
  "plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
362
  "plt.xlabel('Target Class')\n",
363
  "plt.ylabel('Count')\n",
 
374
  },
375
  {
376
  "cell_type": "code",
377
+ "execution_count": null,
378
  "id": "aaa59508",
379
  "metadata": {},
380
  "outputs": [
notebooks/03_data_preprocessing.ipynb CHANGED
@@ -160,7 +160,7 @@
160
  }
161
  ],
162
  "source": [
163
- "balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0, nrows=100)\n",
164
  "balanced_sample_train.head()"
165
  ]
166
  },
@@ -174,17 +174,17 @@
174
  "output_type": "stream",
175
  "text": [
176
  "<class 'pandas.DataFrame'>\n",
177
- "RangeIndex: 100 entries, 0 to 99\n",
178
  "Data columns (total 5 columns):\n",
179
  " # Column Non-Null Count Dtype\n",
180
  "--- ------ -------------- -----\n",
181
- " 0 review_target 100 non-null str \n",
182
- " 1 review_title 100 non-null str \n",
183
- " 2 review_content 100 non-null str \n",
184
- " 3 char_count 100 non-null str \n",
185
- " 4 word_count 100 non-null str \n",
186
  "dtypes: str(5)\n",
187
- "memory usage: 4.0 KB\n"
188
  ]
189
  }
190
  ],
@@ -202,7 +202,7 @@
202
  },
203
  {
204
  "cell_type": "code",
205
- "execution_count": 6,
206
  "id": "2deb74f4",
207
  "metadata": {},
208
  "outputs": [
@@ -308,7 +308,7 @@
308
  "4 love sheet sleek smooth really cool feel perfe... "
309
  ]
310
  },
311
- "execution_count": 6,
312
  "metadata": {},
313
  "output_type": "execute_result"
314
  }
@@ -329,7 +329,7 @@
329
  },
330
  {
331
  "cell_type": "code",
332
- "execution_count": 7,
333
  "id": "2c4e029b",
334
  "metadata": {},
335
  "outputs": [
@@ -346,7 +346,7 @@
346
  "{'csv': PosixPath('data/processed/processed_train.csv')}"
347
  ]
348
  },
349
- "execution_count": 7,
350
  "metadata": {},
351
  "output_type": "execute_result"
352
  }
 
160
  }
161
  ],
162
  "source": [
163
+ "balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0)\n",
164
  "balanced_sample_train.head()"
165
  ]
166
  },
 
174
  "output_type": "stream",
175
  "text": [
176
  "<class 'pandas.DataFrame'>\n",
177
+ "RangeIndex: 79972 entries, 0 to 79971\n",
178
  "Data columns (total 5 columns):\n",
179
  " # Column Non-Null Count Dtype\n",
180
  "--- ------ -------------- -----\n",
181
+ " 0 review_target 79972 non-null str \n",
182
+ " 1 review_title 79972 non-null str \n",
183
+ " 2 review_content 79972 non-null str \n",
184
+ " 3 char_count 79972 non-null str \n",
185
+ " 4 word_count 79972 non-null str \n",
186
  "dtypes: str(5)\n",
187
+ "memory usage: 3.1 MB\n"
188
  ]
189
  }
190
  ],
 
202
  },
203
  {
204
  "cell_type": "code",
205
+ "execution_count": 5,
206
  "id": "2deb74f4",
207
  "metadata": {},
208
  "outputs": [
 
308
  "4 love sheet sleek smooth really cool feel perfe... "
309
  ]
310
  },
311
+ "execution_count": 5,
312
  "metadata": {},
313
  "output_type": "execute_result"
314
  }
 
329
  },
330
  {
331
  "cell_type": "code",
332
+ "execution_count": 6,
333
  "id": "2c4e029b",
334
  "metadata": {},
335
  "outputs": [
 
346
  "{'csv': PosixPath('data/processed/processed_train.csv')}"
347
  ]
348
  },
349
+ "execution_count": 6,
350
  "metadata": {},
351
  "output_type": "execute_result"
352
  }
notebooks/04_feature_engineering.ipynb CHANGED
The diff for this file is too large to render. See raw diff