vikranth1111 committed
Commit a4b94b2 · 1 Parent(s): fff24c6

Upload 16 files

01_kpy_first_model_errors.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
02_error_analysis_first_model.ipynb ADDED
@@ -0,0 +1,612 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] C:\\Users\\kurti\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import nltk\n",
+ "import string\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import PorterStemmer\n",
+ "from nltk.tokenize import TweetTokenizer\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from sklearn.model_selection import StratifiedKFold\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "\n",
+ "nltk.download(\"stopwords\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_tweet(tweet):\n",
+ " \"\"\"\n",
+ " Process tweet function.\n",
+ " Input:\n",
+ " tweet: a string containing a tweet\n",
+ " Returns:\n",
+ " tweets_clean: a string of the processed, stemmed tokens\n",
+ "\n",
+ " *Taken from Coursera NLP Specialization Course 1, week 1 programming\n",
+ " assignment*\n",
+ " \"\"\"\n",
+ " stemmer = PorterStemmer()\n",
+ " stopwords_english = stopwords.words('english')\n",
+ " # remove stock market tickers like $GE\n",
+ " tweet = re.sub(r'\\$\\w*', '', str(tweet))\n",
+ " # remove old style retweet text \"RT\"\n",
+ " tweet = re.sub(r'^RT[\\s]+', '', str(tweet))\n",
+ " # remove hyperlinks\n",
+ " tweet = re.sub(r'https?:\\/\\/.*[\\r\\n]*', '', str(tweet))\n",
+ " # remove hashtags\n",
+ " # only removing the hash # sign from the word\n",
+ " tweet = re.sub(r'#', '', str(tweet))\n",
+ " # tokenize tweets\n",
+ " tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,\n",
+ " reduce_len=True)\n",
+ " tweet_tokens = tokenizer.tokenize(tweet)\n",
+ "\n",
+ " tweets_clean = []\n",
+ " for word in tweet_tokens:\n",
+ " if (word not in stopwords_english and # remove stopwords\n",
+ " word not in string.punctuation): # remove punctuation\n",
+ " # tweets_clean.append(word)\n",
+ " stem_word = stemmer.stem(word) # stemming word\n",
+ " tweets_clean.append(stem_word)\n",
+ "\n",
+ " return \" \".join(tweets_clean)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>id</th>\n",
+ " <th>all_text</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>3796</td>\n",
+ " <td>new weapon caus un-imagin destruct destructionnon</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>3185</td>\n",
+ " <td>f @ing thing gishwh got soak delug go pad tamp...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>7769</td>\n",
+ " <td>dt rt ‰ ûïthe col polic catch pickpocket liver...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>191</td>\n",
+ " <td>aftershock back school kick great want thank e...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>9810</td>\n",
+ " <td>respons trauma children addict develop defens ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>7934</td>\n",
+ " <td>look like got caught rainstorm amaz disgust ti...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>2538</td>\n",
+ " <td>favorit ladi came volunt meet hope join youth ...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>2611</td>\n",
+ " <td>ux fail emv peopl want insert remov quickli li...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>9756</td>\n",
+ " <td>can't find ariana grand shirt fuck tragedytrag...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>6254</td>\n",
+ " <td>murder stori america ‰ ûª first hijack</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " id all_text\n",
+ "0 3796 new weapon caus un-imagin destruct destructionnon\n",
+ "1 3185 f @ing thing gishwh got soak delug go pad tamp...\n",
+ "2 7769 dt rt ‰ ûïthe col polic catch pickpocket liver...\n",
+ "3 191 aftershock back school kick great want thank e...\n",
+ "4 9810 respons trauma children addict develop defens ...\n",
+ "5 7934 look like got caught rainstorm amaz disgust ti...\n",
+ "6 2538 favorit ladi came volunt meet hope join youth ...\n",
+ "7 2611 ux fail emv peopl want insert remov quickli li...\n",
+ "8 9756 can't find ariana grand shirt fuck tragedytrag...\n",
+ "9 6254 murder stori america ‰ ûª first hijack"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# read train data\n",
+ "df = pd.read_csv(\"../inputs/train.csv\")\n",
+ "# shuffle data\n",
+ "df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
+ "# create new column \"all_text\"\n",
+ "df[\"all_text\"] = df[\"text\"] + df[\"keyword\"].fillna(\"none\") + df[\"location\"].fillna(\"none\")\n",
+ "# split into features and labels\n",
+ "X = df.drop([\"text\", \"keyword\", \"location\", \"target\"], axis=1)\n",
+ "y = df[\"target\"]\n",
+ "\n",
+ "# process tweets\n",
+ "X[\"all_text\"] = X[\"all_text\"].apply(process_tweet)\n",
+ "X.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# map each tweet idx to its out-of-fold prediction\n",
+ "pred_idx_dict = {}\n",
+ "# initialize kfold\n",
+ "skf = StratifiedKFold(n_splits=5, shuffle=False)\n",
+ "for fold, (train_idx, val_idx) in enumerate(skf.split(X=X, y=y)):\n",
+ " X_train, X_val = X.loc[train_idx, :], X.loc[val_idx, :]\n",
+ " y_train, y_val = y[train_idx], y[val_idx]\n",
+ "\n",
+ " # vectorize text, fitting the vectorizer on the training fold only\n",
+ " count_vect = CountVectorizer()\n",
+ " X_train_vect = count_vect.fit_transform(X_train[\"all_text\"].values)\n",
+ " X_val_vect = count_vect.transform(X_val[\"all_text\"].values)\n",
+ " \n",
+ " # fit the classifier and predict on the validation fold\n",
+ " clf = MultinomialNB()\n",
+ " clf.fit(X_train_vect, y_train)\n",
+ " y_preds = clf.predict(X_val_vect)\n",
+ " \n",
+ " # map each validation tweet idx to its prediction\n",
+ " for idx, key in enumerate(val_idx):\n",
+ " pred_idx_dict[key] = y_preds[idx]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create df with actual labels and predictions\n",
+ "error_df = X.copy()\n",
+ "error_df.rename(columns={\"all_text\":\"processed_all_text\"}, inplace=True)\n",
+ "error_df[\"all_text\"] = df[\"all_text\"].values # rows of X are aligned with df\n",
+ "error_df[\"actual\"] = y.copy()\n",
+ "error_df[\"predictions\"] = error_df.index.map(pred_idx_dict) # align by index, not dict insertion order"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>id</th>\n",
+ " <th>processed_all_text</th>\n",
+ " <th>all_text</th>\n",
+ " <th>actual</th>\n",
+ " <th>predictions</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>3796</td>\n",
+ " <td>new weapon caus un-imagin destruct destructionnon</td>\n",
+ " <td>So you have a new weapon that can cause un-ima...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>3185</td>\n",
+ " <td>f @ing thing gishwh got soak delug go pad tamp...</td>\n",
+ " <td>The f$&amp;amp;@ing things I do for #GISHWHES Just...</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>7769</td>\n",
+ " <td>dt rt ‰ ûïthe col polic catch pickpocket liver...</td>\n",
+ " <td>DT @georgegalloway: RT @Galloway4Mayor: ‰ÛÏThe...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>191</td>\n",
+ " <td>aftershock back school kick great want thank e...</td>\n",
+ " <td>Aftershock back to school kick off was great. ...</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>9810</td>\n",
+ " <td>respons trauma children addict develop defens ...</td>\n",
+ " <td>in response to trauma Children of Addicts deve...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7608</th>\n",
+ " <td>7470</td>\n",
+ " <td>mani obliter server alway like play :D obliter...</td>\n",
+ " <td>@Eganator2000 There aren't many Obliteration s...</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7609</th>\n",
+ " <td>7691</td>\n",
+ " <td>panic attack bc enough money drug alcohol want...</td>\n",
+ " <td>just had a panic attack bc I don't have enough...</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7610</th>\n",
+ " <td>1242</td>\n",
+ " <td>omron hem 712c automat blood pressur monitor s...</td>\n",
+ " <td>Omron HEM-712C Automatic Blood Pressure Monito...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7611</th>\n",
+ " <td>10862</td>\n",
+ " <td>offici say quarantin place alabama home possib...</td>\n",
+ " <td>Officials say a quarantine is in place at an A...</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7612</th>\n",
+ " <td>10409</td>\n",
+ " <td>move england five year ago today whirlwind time</td>\n",
+ " <td>I moved to England five years ago today. What ...</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>7613 rows × 5 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " id processed_all_text \\\n",
+ "0 3796 new weapon caus un-imagin destruct destructionnon \n",
+ "1 3185 f @ing thing gishwh got soak delug go pad tamp... \n",
+ "2 7769 dt rt ‰ ûïthe col polic catch pickpocket liver... \n",
+ "3 191 aftershock back school kick great want thank e... \n",
+ "4 9810 respons trauma children addict develop defens ... \n",
+ "... ... ... \n",
+ "7608 7470 mani obliter server alway like play :D obliter... \n",
+ "7609 7691 panic attack bc enough money drug alcohol want... \n",
+ "7610 1242 omron hem 712c automat blood pressur monitor s... \n",
+ "7611 10862 offici say quarantin place alabama home possib... \n",
+ "7612 10409 move england five year ago today whirlwind time \n",
+ "\n",
+ " all_text actual predictions \n",
+ "0 So you have a new weapon that can cause un-ima... 1 0 \n",
+ "1 The f$&amp;@ing things I do for #GISHWHES Just... 0 0 \n",
+ "2 DT @georgegalloway: RT @Galloway4Mayor: ‰ÛÏThe... 1 0 \n",
+ "3 Aftershock back to school kick off was great. ... 0 0 \n",
+ "4 in response to trauma Children of Addicts deve... 0 1 \n",
+ "... ... ... ... \n",
+ "7608 @Eganator2000 There aren't many Obliteration s... 0 0 \n",
+ "7609 just had a panic attack bc I don't have enough... 0 0 \n",
+ "7610 Omron HEM-712C Automatic Blood Pressure Monito... 0 1 \n",
+ "7611 Officials say a quarantine is in place at an A... 1 1 \n",
+ "7612 I moved to England five years ago today. What ... 1 1 \n",
+ "\n",
+ "[7613 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "error_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>id</th>\n",
+ " <th>processed_all_text</th>\n",
+ " <th>all_text</th>\n",
+ " <th>actual</th>\n",
+ " <th>predictions</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>149</th>\n",
+ " <td>1061</td>\n",
+ " <td>ye i'm bleed heart liberal.bleedingl oak tx</td>\n",
+ " <td>@KatRamsland Yes I'm a bleeding heart liberal....</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>518</th>\n",
+ " <td>8946</td>\n",
+ " <td>storm came . . fuck coolstormnon</td>\n",
+ " <td>So this storm just came out of no where. .fuck...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3161</th>\n",
+ " <td>143</td>\n",
+ " <td>car even week got fuck car accid .. mf can't f...</td>\n",
+ " <td>only had a car for not even a week and got in ...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6624</th>\n",
+ " <td>9044</td>\n",
+ " <td>spacex founder musk structur failur took falcon 9</td>\n",
+ " <td>SpaceX Founder Musk: Structural Failure Took D...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>881</th>\n",
+ " <td>1458</td>\n",
+ " <td>anoth one anoth one still ain't done shit one ...</td>\n",
+ " <td>'I did another one I did another one. You stil...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4314</th>\n",
+ " <td>10364</td>\n",
+ " <td>router one latest ddo attack weapon</td>\n",
+ " <td>Your Router is One of the Latest DDoS Attack W...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5399</th>\n",
+ " <td>6188</td>\n",
+ " <td>gov brown allow parol 1976 chowchilla school b...</td>\n",
+ " <td>Gov. Brown allows parole for 1976 Chowchilla s...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4266</th>\n",
+ " <td>4911</td>\n",
+ " <td>chick masturb guy get explod face</td>\n",
+ " <td>Chick masturbates a guy until she gets explode...</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3959</th>\n",
+ " <td>2112</td>\n",
+ " <td>borrow concern possibl interest rate rise coul...</td>\n",
+ " <td>#Borrowers concerned at possible #interest rat...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6445</th>\n",
+ " <td>7926</td>\n",
+ " <td>stuck rainstorm stay toward middl road street ...</td>\n",
+ " <td>Stuck in a rainstorm? Stay toward the middle o...</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " id processed_all_text \\\n",
+ "149 1061 ye i'm bleed heart liberal.bleedingl oak tx \n",
+ "518 8946 storm came . . fuck coolstormnon \n",
+ "3161 143 car even week got fuck car accid .. mf can't f... \n",
+ "6624 9044 spacex founder musk structur failur took falcon 9 \n",
+ "881 1458 anoth one anoth one still ain't done shit one ... \n",
+ "4314 10364 router one latest ddo attack weapon \n",
+ "5399 6188 gov brown allow parol 1976 chowchilla school b... \n",
+ "4266 4911 chick masturb guy get explod face \n",
+ "3959 2112 borrow concern possibl interest rate rise coul... \n",
+ "6445 7926 stuck rainstorm stay toward middl road street ... \n",
+ "\n",
+ " all_text actual predictions \n",
+ "149 @KatRamsland Yes I'm a bleeding heart liberal.... 1 0 \n",
+ "518 So this storm just came out of no where. .fuck... 1 0 \n",
+ "3161 only had a car for not even a week and got in ... 1 0 \n",
+ "6624 SpaceX Founder Musk: Structural Failure Took D... 1 0 \n",
+ "881 'I did another one I did another one. You stil... 1 0 \n",
+ "4314 Your Router is One of the Latest DDoS Attack W... 0 1 \n",
+ "5399 Gov. Brown allows parole for 1976 Chowchilla s... 0 1 \n",
+ "4266 Chick masturbates a guy until she gets explode... 1 0 \n",
+ "3959 #Borrowers concerned at possible #interest rat... 0 1 \n",
+ "6445 Stuck in a rainstorm? Stay toward the middle o... 0 1 "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# store only the misclassified instances\n",
+ "misclassified_df = error_df[error_df[\"actual\"] != error_df[\"predictions\"]]\n",
+ "# keep only 100 of the misclassified instances\n",
+ "misclassified_100 = misclassified_df.sample(n=100, random_state=42)\n",
+ "misclassified_100.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "misclassified_100.to_csv(\"misclassified_data.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
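
The fold loop above collects out-of-fold predictions by hand and then has to realign them by index. A minimal sketch of the same idea with sklearn's cross_val_predict, assuming the X["all_text"] and y built in the cells above; a Pipeline re-fits the vectorizer on each training fold, so there is no leakage, and the returned predictions are already in row order:

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import StratifiedKFold, cross_val_predict
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import make_pipeline

    # vectorizer + classifier fitted per fold
    pipe = make_pipeline(CountVectorizer(), MultinomialNB())
    skf = StratifiedKFold(n_splits=5, shuffle=False)
    # out-of-fold predictions, ordered by row index
    oof_preds = cross_val_predict(pipe, X["all_text"].values, y, cv=skf)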
03_kpy_data_exploration.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
VERSION ADDED
@@ -0,0 +1 @@
+ 0.1.0
config.py ADDED
@@ -0,0 +1,40 @@
+ # data
+ DATA_DIR = "../inputs/"
+ ORIGINAL_TRAIN = DATA_DIR + "train.csv"
+ MODIFIED_TRAIN = DATA_DIR + "modified_train.csv"
+ TEST_DATA = DATA_DIR + "test.csv"
+ MODIFIED_TEST = DATA_DIR + "modified_test.csv"
+ SUBMISSION = DATA_DIR + "sample_submission.csv"
+ MODEL_DIR = "../models/"
+ IMAGES = "../images/"
+
+ # features
+ ID = "id"
+ TEXT = "text"
+ KEYWORD = "keyword"
+ LOCATION = "location"
+ FOLD = "kfold"
+ TOKENS = "tokens"
+
+ # created features
+ ALL_TEXT = "all_text"
+ CLEANED_TEXT = "cleaned_text"
+
+ # target
+ TARGET = "target"
+ RELABELED_TARGET = "relabeled_target"
+
+ # Pretrained Word2Vec
+ PRETRAINED_WORD2VEC = "word2vec-google-news-300"
+ EMBED_SIZE = 300
+
+ # TRAINING
+ HIDDEN_DIM = 256
+ TARGET_DIM = 1
+ BATCH_SIZE = 32
+ N_EPOCHS = 8
+ N_SPLITS = 5
+ LEARNING_RATE = 1e-3
+ MAXLEN = 202
+ VOCAB_SIZE = 172901
+
data_cleaning.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+
+ import config
+
+ def relabel_target(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Relabel duplicate tweets that are mislabelled in the training dataset
+     :param df: A pandas dataframe with a "target" column
+     :return: df with the relabelled target column added
+     """
+     # copy old target label
+     df[config.RELABELED_TARGET] = df[config.TARGET].copy()
+     # relabel samples whose labels disagree with their duplicates
+     df.loc[df[config.TEXT] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == 'Hellfire is surrounded by desires so be careful and don‰Ûªt let your desires control you! #Afterlife',
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == 'To fight bioterrorism sir.',
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4',
+            config.RELABELED_TARGET] = 1
+     df.loc[df[config.TEXT] == 'CLEARED:incident with injury:I-495 inner loop Exit 31 - MD 97/Georgia Ave Silver Spring',
+            config.RELABELED_TARGET] = 1
+     df.loc[df[config.TEXT] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption',
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!',
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE',
+            config.RELABELED_TARGET] = 1
+     df.loc[df[config.TEXT] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG',
+            config.RELABELED_TARGET] = 1
+     df.loc[df[config.TEXT] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "Hellfire! We don‰Ûªt even want to think about it or mention it so let‰Ûªs not do anything that leads to it #islam!",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "Caution: breathing may be hazardous to your health.",
+            config.RELABELED_TARGET] = 1
+     df.loc[df[config.TEXT] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect",
+            config.RELABELED_TARGET] = 0
+     df.loc[df[config.TEXT] == "that horrible sinking feeling when you‰Ûªve been at home on your phone for a while and you realise its been on 3G this whole time",
+            config.RELABELED_TARGET] = 0
+     return df
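
A minimal usage sketch for relabel_target, assuming the file paths defined in config.py:

    import pandas as pd

    import config
    from data_cleaning import relabel_target

    df = pd.read_csv(config.ORIGINAL_TRAIN)
    df = relabel_target(df)
    # count how many labels the relabelling changed
    n_changed = (df[config.TARGET] != df[config.RELABELED_TARGET]).sum()
    print(f"{n_changed} tweets relabelled")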
features.py ADDED
@@ -0,0 +1,19 @@
+ import numpy as np
+ import gensim.downloader as api
+
+ import config
+
+ def get_word2vec_enc(corpus, gensim_pretrained_emb: str) -> np.ndarray:
+     """
+     Get the pretrained Word2Vec vector for each word in the corpus
+     :param corpus: iterable of (word, index) pairs, e.g. tokenizer.word_index.items()
+     :param gensim_pretrained_emb: name of the pretrained gensim embedding to load
+     :return: embedding weight matrix of shape (VOCAB_SIZE, EMBED_SIZE)
+     """
+     word_vecs = api.load(gensim_pretrained_emb)
+     embedding_weights = np.zeros((config.VOCAB_SIZE, config.EMBED_SIZE))
+     for word, i in corpus:
+         # words missing from the pretrained vocabulary keep an all-zero row
+         if word in word_vecs:
+             embedding_weights[i] = word_vecs[word]
+     return embedding_weights
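
A usage sketch; the two example sentences passed to fit_on_texts are hypothetical stand-ins for the cleaned training text used in train.py below:

    from tensorflow.keras.preprocessing.text import Tokenizer

    import config
    import features as f

    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(["forest fire near la ronge", "residents asked shelter place"])
    # rows for words absent from the pretrained vectors remain all-zero
    embedding_matrix = f.get_word2vec_enc(tokenizer.word_index.items(), config.PRETRAINED_WORD2VEC)
    print(embedding_matrix.shape)  # (config.VOCAB_SIZE, config.EMBED_SIZE)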
fixed_df_naive_bayes.png ADDED
incorrect_naive_bayes.png ADDED
inference.py ADDED
@@ -0,0 +1,40 @@
+ import pickle
+
+ import numpy as np
+ import pandas as pd
+ from tensorflow.keras.models import load_model
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ import config
+ import preprocessing as pp
+
+ def predict_test(model: str, test_data: str = config.MODIFIED_TEST):
+
+     # path to model
+     model_path = f"{config.MODEL_DIR}PRETRAIN_WORD2VEC_{model}/"
+
+     # read data
+     df_test = pd.read_csv(test_data)
+
+     # clean the text
+     df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)
+
+     # load the tokenizer fitted in train.py
+     with open(f'{model_path}tokenizer.pkl', 'rb') as handle:
+         tokenizer = pickle.load(handle)
+
+     # convert tokens to sequences and pad them
+     data_values = tokenizer.texts_to_sequences(df_test[config.CLEANED_TEXT].values)
+     X_padded = pad_sequences(data_values, maxlen=config.MAXLEN)
+
+     # load the classifier
+     clf = load_model(f"{model_path}{model}_Word2Vec.h5")
+     predictions = clf.predict_classes(X_padded, verbose=0)
+
+     return predictions
+
+ if __name__ == "__main__":
+     submission = predict_test(model="LSTM")
+     sample_sub = pd.read_csv(config.SUBMISSION)
+     sample_sub.loc[:, config.TARGET] = submission
+     sample_sub.to_csv(f"{config.MODEL_DIR}PRETRAIN_WORD2VEC_LSTM/LSTM.csv", index=False)
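
Note: Sequential.predict_classes was removed in TensorFlow 2.6. On newer versions an equivalent (a sketch, not part of this commit) is to threshold the sigmoid output:

    predictions = (clf.predict(X_padded) > 0.5).astype("int32")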
lstm_model.py ADDED
@@ -0,0 +1,18 @@
+ from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional
+ from tensorflow.keras import Sequential
+
+ def my_LSTM(embedding_layer):
+     print('Creating model...')
+     model = Sequential()
+     model.add(embedding_layer)
+     model.add(Dropout(0.2))
+     model.add(Bidirectional(LSTM(units=64, dropout=0.1, recurrent_dropout=0.1)))
+     model.add(Dense(50, activation="relu"))
+     model.add(Dropout(0.1))
+     model.add(Dense(1, activation="sigmoid"))
+
+     print('Compiling...')
+     model.compile(loss='binary_crossentropy',
+                   optimizer='adam',
+                   metrics=["accuracy"])
+     return model
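
A minimal sketch wiring my_LSTM to an embedding layer, with shapes taken from config.py; the random weights stand in for the Word2Vec matrix built in train.py:

    import numpy as np
    from tensorflow.keras.layers import Embedding

    import config
    from lstm_model import my_LSTM

    dummy_weights = np.random.rand(config.VOCAB_SIZE, config.EMBED_SIZE)
    embedding_layer = Embedding(input_dim=config.VOCAB_SIZE,
                                output_dim=config.EMBED_SIZE,
                                weights=[dummy_weights],
                                input_length=config.MAXLEN,
                                trainable=False)
    model = my_LSTM(embedding_layer)
    model.summary()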
model_dispatcher.py ADDED
@@ -0,0 +1,8 @@
+ from sklearn import linear_model, naive_bayes, ensemble, svm
+
+ MODELS = {
+     "logistic_regression": linear_model.LogisticRegression(max_iter=1000, random_state=42),
+     "naive_bayes": naive_bayes.MultinomialNB(),
+     "random_forest": ensemble.RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1),
+     "svm": svm.SVC(C=10)
+ }
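
The dispatcher keeps model selection out of the training code; a usage sketch, where X_train_vect, X_val_vect and y_train are hypothetical outputs of a fitted CountVectorizer:

    from model_dispatcher import MODELS

    clf = MODELS["naive_bayes"]  # or "logistic_regression", "random_forest", "svm"
    clf.fit(X_train_vect, y_train)
    y_preds = clf.predict(X_val_vect)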
preprocessing.py ADDED
@@ -0,0 +1,36 @@
+ import re
+ import string
+
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import TweetTokenizer
+
+ nltk.download("stopwords")
+
+ def clean_tweet(tweet: str) -> str:
+     """
+     Remove stock market tickers, the RT symbol, hyperlinks and the hashtag symbol, then lowercase the text and strip punctuation and stopwords
+     :param tweet: tweet by a unique user
+     :return: cleaned string without hashtags, punctuation, or stopwords
+     """
+     # remove stock market tickers like $GE
+     tweet = re.sub(r'\$\w*', '', str(tweet))
+     # remove old style retweet text "RT" (before lowercasing; the pattern is case-sensitive)
+     tweet = re.sub(r'^RT[\s]+', '', str(tweet))
+     # remove hyperlinks
+     tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', str(tweet))
+     # remove hashtags
+     # only removing the hash # sign from the word
+     tweet = re.sub(r'#', '', str(tweet))
+     # make text lower case
+     tweet = tweet.lower()
+
+     # remove punctuation
+     punct = set(string.punctuation)
+     tweet = "".join(ch for ch in tweet if ch not in punct)
+
+     # remove stopwords
+     stop_words = set(stopwords.words("english"))
+     tweet = " ".join(word for word in tweet.split() if word not in stop_words)
+
+     return tweet
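
A quick check of clean_tweet on a made-up tweet; the output shown is approximate, since the exact result depends on the NLTK stopword list in use:

    from preprocessing import clean_tweet

    print(clean_tweet("RT @user: Forest fire near La Ronge Sask. #wildfire http://t.co/abc"))
    # -> roughly: "user forest fire near la ronge sask wildfire"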
train.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ # select the plaidml Keras backend for GPU use; must be set before Keras is imported
+ os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
+
+ import pickle
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import f1_score
+ from tensorflow.keras.layers import Embedding
+ from sklearn.model_selection import StratifiedKFold
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ import config
+ import preprocessing as pp
+ import features as f
+ import data_cleaning as data_clean
+ from lstm_model import my_LSTM
+
+ def run_training(model: str) -> None:
+     """
+     Train the model and serialize it to disk
+     """
+     # read train and test data
+     df_train = pd.read_csv(config.ORIGINAL_TRAIN)
+     df_test = pd.read_csv(config.TEST_DATA)
+
+     # relabel mislabeled samples
+     df_train = data_clean.relabel_target(df_train)
+
+     # shuffle data
+     df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
+
+     # clean the text
+     df_train[config.CLEANED_TEXT] = df_train[config.TEXT].apply(pp.clean_tweet)
+     df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)
+
+     # save the modified train and test data
+     df_train.to_csv(config.MODIFIED_TRAIN, index=False)
+     df_test.to_csv(config.MODIFIED_TEST, index=False)
+     del df_test
+
+     # convert text to numerical representation
+     tokenizer = Tokenizer(oov_token="<unk>")
+     tokenizer.fit_on_texts(df_train[config.CLEANED_TEXT])
+
+     # path to save model
+     model_path = f"{config.MODEL_DIR}PRETRAIN_WORD2VEC_{model}/"
+
+     # create the model folder if it does not exist
+     if not os.path.exists(model_path):
+         os.makedirs(model_path)
+
+     # save the tokenizer
+     with open(f'{model_path}tokenizer.pkl', 'wb') as handle:
+         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+     # pad the sequences
+     X_padded = pad_sequences(tokenizer.texts_to_sequences(df_train[config.CLEANED_TEXT].values), maxlen=config.MAXLEN)
+
+     # get the pretrained word embeddings and prepare the embedding layer
+     embedding_matrix = f.get_word2vec_enc(tokenizer.word_index.items(), config.PRETRAINED_WORD2VEC)
+     embedding_layer = Embedding(input_dim=config.VOCAB_SIZE,
+                                 output_dim=config.EMBED_SIZE,
+                                 weights=[embedding_matrix],
+                                 input_length=config.MAXLEN,
+                                 trainable=False)
+
+     # target values
+     y = df_train[config.RELABELED_TARGET].values
+
+     # train a single model
+     clf = my_LSTM(embedding_layer)
+     clf.fit(X_padded, y,
+             epochs=config.N_EPOCHS,
+             verbose=1)
+
+     # persist the model
+     clf.save(f"{model_path}{model}_Word2Vec.h5")
+
+ if __name__ == "__main__":
+     run_training("LSTM")
+
+
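
train.py imports StratifiedKFold and f1_score but fits a single model on the full training set. A sketch of the per-fold evaluation those imports suggest, assuming the X_padded, y and embedding_layer built inside run_training and config.N_SPLITS folds; this is one possible evaluation loop, not the commit's method:

    skf = StratifiedKFold(n_splits=config.N_SPLITS, shuffle=False)
    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_padded, y)):
        # the frozen embedding layer can be shared safely across folds
        clf = my_LSTM(embedding_layer)
        clf.fit(X_padded[train_idx], y[train_idx],
                epochs=config.N_EPOCHS, verbose=0)
        val_preds = (clf.predict(X_padded[val_idx]) > 0.5).astype("int32")
        fold_scores.append(f1_score(y[val_idx], val_preds))
    print(f"mean F1 across folds: {np.mean(fold_scores):.4f}")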
user_interface.py ADDED
@@ -0,0 +1,47 @@
+ import pickle
+
+ import numpy as np
+ import gradio as gr
+ from tensorflow.keras.models import load_model
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ from src import config
+ from src import preprocessing as pp
+
+ def predict(text: str):
+     """
+     Predict the class of an instance
+     :param text: The tweet text we want to classify
+     :return: The model output as a readable label
+     """
+     outcome_dict = {0: "Non-Disaster", 1: "Disaster"}
+
+     # path to model
+     model_path = "models/PRETRAIN_WORD2VEC_LSTM/"
+
+     # clean the text
+     clean_text = pp.clean_tweet(text)
+     clean_text = np.array([clean_text])
+
+     # load the tokenizer fitted in train.py
+     with open(f'{model_path}tokenizer.pkl', 'rb') as handle:
+         tokenizer = pickle.load(handle)
+
+     # convert tokens to sequences and pad them
+     data_values = tokenizer.texts_to_sequences(clean_text)
+     X_padded = pad_sequences(data_values, maxlen=config.MAXLEN)
+
+     # load the classifier
+     clf = load_model(f"{model_path}LSTM_Word2Vec.h5")
+     prediction = clf.predict_classes(X_padded, verbose=0)
+
+     prediction = prediction.sum()
+     return outcome_dict[prediction]
+
+ if __name__ == "__main__":
+     iface = gr.Interface(
+         fn=predict,
+         inputs=gr.inputs.Textbox(lines=3, placeholder="Insert Tweet..."),
+         outputs="text"
+     )
+     iface.launch()
utils.py ADDED
@@ -0,0 +1,15 @@
+ # Code Source
+ # https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
+
+ from tensorflow.keras import backend as K
+
+ def f1_metric(y_true, y_pred):  # taken from old Keras source code
+     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+     precision = true_positives / (predicted_positives + K.epsilon())
+     recall = true_positives / (possible_positives + K.epsilon())
+     f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
+     return f1_val
+
+
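
f1_metric plugs into compile like any built-in metric; a sketch with a hypothetical model (e.g. one returned by lstm_model.my_LSTM). Keras averages the metric over batches, so the reported value only approximates the global F1:

    from utils import f1_metric

    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy", f1_metric])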