Spaces:

elsayedelmandoh
/

sentiment-sleuth

Sleeping

App Files Community

elsayedelmandoh committed on Mar 4

Commit

9ac3023

1 Parent(s): f9169b8

update

Browse files

Files changed (3) hide show

notebooks/02_eda.ipynb +44 -3
notebooks/03_data_preprocessing.ipynb +12 -12
notebooks/04_feature_engineering.ipynb +0 -0

notebooks/02_eda.ipynb CHANGED Viewed

@@ -180,6 +180,26 @@
     "sample_train.info()"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "f082d532",
@@ -300,7 +320,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "id": "2da64228",
    "metadata": {},
    "outputs": [
@@ -316,7 +346,18 @@
     }
    ],
    "source": [
-    "sns.countplot(x='review_target', data=sample_train)\n",
     "plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
     "plt.xlabel('Target Class')\n",
     "plt.ylabel('Count')\n",
@@ -333,7 +374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "id": "aaa59508",
    "metadata": {},
    "outputs": [

     "sample_train.info()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dedcfbf6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('There are {} rows and {} columns in train'.format(sample_train.shape[0], sample_train.shape[1]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6e524fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_train.describe()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "f082d532",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "id": "408d9237",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_train.groupby('review_target').describe()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "id": "2da64228",
    "metadata": {},
    "outputs": [
     }
    ],
    "source": [
+    "ax= sns.countplot(x='review_target', data=sample_train)\n",
+    "\n",
+    "for p in ax.patches: # bars\n",
+    "    '''\n",
+    "    get_bbox(): return bounding box of the bar, \n",
+    "    get_points(): returns the coordinates of the four corners of the bounding box.\n",
+    "    '''\n",
+    "    x= p.get_bbox().get_points()[:,0] # extract the x-coordinates of the four corners of the bar rectangle\n",
+    "    y= p.get_bbox().get_points()[1,1] # extract the y-coordinate of the top-right corner\n",
+    "    ax.annotate(f'{y:.0f}', (x.mean(), y), ha='center',va='bottom') # text on top bar\n",
+    "    \n",
+    "\n",
     "plt.title('Distribution of Target Classes in Sample Train Dataset')\n",
     "plt.xlabel('Target Class')\n",
     "plt.ylabel('Count')\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "aaa59508",
    "metadata": {},
    "outputs": [

notebooks/03_data_preprocessing.ipynb CHANGED Viewed

@@ -160,7 +160,7 @@
     }
    ],
    "source": [
-    "balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0, nrows=100)\n",
     "balanced_sample_train.head()"
    ]
   },
@@ -174,17 +174,17 @@
      "output_type": "stream",
      "text": [
       "<class 'pandas.DataFrame'>\n",
-      "RangeIndex: 100 entries, 0 to 99\n",
       "Data columns (total 5 columns):\n",
       " #   Column          Non-Null Count  Dtype\n",
       "---  ------          --------------  -----\n",
-      " 0   review_target   100 non-null    str  \n",
-      " 1   review_title    100 non-null    str  \n",
-      " 2   review_content  100 non-null    str  \n",
-      " 3   char_count      100 non-null    str  \n",
-      " 4   word_count      100 non-null    str  \n",
       "dtypes: str(5)\n",
-      "memory usage: 4.0 KB\n"
      ]
     }
    ],
@@ -202,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "2deb74f4",
    "metadata": {},
    "outputs": [
@@ -308,7 +308,7 @@
        "4  love sheet sleek smooth really cool feel perfe...  "
       ]
      },
-     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -329,7 +329,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "2c4e029b",
    "metadata": {},
    "outputs": [
@@ -346,7 +346,7 @@
        "{'csv': PosixPath('data/processed/processed_train.csv')}"
       ]
      },
-     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }

     }
    ],
    "source": [
+    "balanced_sample_train = pd.read_csv(r'data/balanced/balanced_sample_train.csv', dtype=str, quoting=0)\n",
     "balanced_sample_train.head()"
    ]
   },
      "output_type": "stream",
      "text": [
       "<class 'pandas.DataFrame'>\n",
+      "RangeIndex: 79972 entries, 0 to 79971\n",
       "Data columns (total 5 columns):\n",
       " #   Column          Non-Null Count  Dtype\n",
       "---  ------          --------------  -----\n",
+      " 0   review_target   79972 non-null  str  \n",
+      " 1   review_title    79972 non-null  str  \n",
+      " 2   review_content  79972 non-null  str  \n",
+      " 3   char_count      79972 non-null  str  \n",
+      " 4   word_count      79972 non-null  str  \n",
       "dtypes: str(5)\n",
+      "memory usage: 3.1 MB\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "2deb74f4",
    "metadata": {},
    "outputs": [
        "4  love sheet sleek smooth really cool feel perfe...  "
       ]
      },
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "2c4e029b",
    "metadata": {},
    "outputs": [
        "{'csv': PosixPath('data/processed/processed_train.csv')}"
       ]
      },
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }

notebooks/04_feature_engineering.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff