{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Comment Toxicity.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU",
    "gpuClass": "standard"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S99TDbfWWq0u",
        "outputId": "27146bb5-1f2c-42ce-9d75-f2e46418494e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "ls drive/MyDrive/jigsaw-toxic-comment-classification-challenge/"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6MtlVgwufR8M",
        "outputId": "acbed960-207b-4f6a-f5da-077cfba36c1d"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[0m\u001b[01;34msample_submission.csv\u001b[0m/     test.csv.zip         train.csv.zip\n",
            "sample_submission.csv.zip  \u001b[01;34mtest_labels.csv\u001b[0m/     X_test.pickle\n",
            "simple_model.h5            test_labels.csv.zip  X_train.pickle\n",
            "\u001b[01;34mtest.csv\u001b[0m/                  \u001b[01;34mtrain.csv\u001b[0m/\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%cd drive/MyDrive/jigsaw-toxic-comment-classification-challenge/"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "la5teYvGfcZ2",
        "outputId": "60f69844-0a0e-4b3f-9882-d1291ee433b4"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import tensorflow as tf\n",
        "import matplotlib.pyplot as plt\n",
        "from tensorflow import keras\n",
        "from keras.preprocessing.text import Tokenizer\n",
        "from keras.preprocessing.sequence import pad_sequences\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')\n",
        "from nltk.tokenize import word_tokenize\n",
        "import re\n",
        "from sklearn.model_selection import train_test_split"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0iHYnJ53foct",
        "outputId": "d9c9717b-5368-474b-b3e8-dcd85f4f4ce5"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "train = pd.read_csv('train.csv/train.csv')\n"
      ],
      "metadata": {
        "id": "GBGHqE1Wfeb9"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test = pd.read_csv('test.csv/test.csv')"
      ],
      "metadata": {
        "id": "-wGlqD9GZrkC"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_labels = pd.read_csv('test_labels.csv/test_labels.csv')"
      ],
      "metadata": {
        "id": "haDqTJbRZwy3"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train.shape"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mllErgckfvOa",
        "outputId": "fc720f9b-d80d-48af-fed8-b472d554c22f"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(159571, 8)"
            ]
          },
          "metadata": {},
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "train.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 461
        },
        "id": "v5JPdmYTf36i",
        "outputId": "83d2d1ca-af74-4a91-8ac5-594f7ea28055"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                 id                                       comment_text  toxic  \\\n",
              "0  0000997932d777bf  Explanation\\nWhy the edits made under my usern...      0   \n",
              "1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   \n",
              "2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   \n",
              "3  0001b41b1c6bb37e  \"\\nMore\\nI can't make any real suggestions on ...      0   \n",
              "4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   \n",
              "\n",
              "   severe_toxic  obscene  threat  insult  identity_hate  \n",
              "0             0        0       0       0              0  \n",
              "1             0        0       0       0              0  \n",
              "2             0        0       0       0              0  \n",
              "3             0        0       0       0              0  \n",
              "4             0        0       0       0              0  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-8f71142f-150c-41dd-bf4f-57aa47dad455\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>id</th>\n",
              "      <th>comment_text</th>\n",
              "      <th>toxic</th>\n",
              "      <th>severe_toxic</th>\n",
              "      <th>obscene</th>\n",
              "      <th>threat</th>\n",
              "      <th>insult</th>\n",
              "      <th>identity_hate</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0000997932d777bf</td>\n",
              "      <td>Explanation\\nWhy the edits made under my usern...</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>000103f0d9cfb60f</td>\n",
              "      <td>D'aww! He matches this background colour I'm s...</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>000113f07ec002fd</td>\n",
              "      <td>Hey man, I'm really not trying to edit war. It...</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>0001b41b1c6bb37e</td>\n",
              "      <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0001d958c54c6e35</td>\n",
              "      <td>You, sir, are my hero. Any chance you remember...</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8f71142f-150c-41dd-bf4f-57aa47dad455')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-8f71142f-150c-41dd-bf4f-57aa47dad455 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-8f71142f-150c-41dd-bf4f-57aa47dad455');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Check for Missing values - no missing values found"
      ],
      "metadata": {
        "id": "aGx06xC0hj4z"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "train.isnull().sum()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2YH3nZJbhgWP",
        "outputId": "f9ccab2f-bb8e-4d97-d171-a50208ce66c1"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "id               0\n",
              "comment_text     0\n",
              "toxic            0\n",
              "severe_toxic     0\n",
              "obscene          0\n",
              "threat           0\n",
              "insult           0\n",
              "identity_hate    0\n",
              "dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "X_train = train.iloc[:,1]\n",
        "Y_train = train.iloc[:,2:]"
      ],
      "metadata": {
        "id": "j8CQE7Ylf5Eh"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_test = test.iloc[:,1]\n",
        "Y_test = test_labels.iloc[:,1:]"
      ],
      "metadata": {
        "id": "DqOgLlX3ErqC"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentences = np.asarray(X_train)"
      ],
      "metadata": {
        "id": "fAyLcRFEgNEs"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 1: REMOVING CONTRACTIONS**"
      ],
      "metadata": {
        "id": "NhKFgZPeo8Vw"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "CONTRACTION_MAP = {\n",
        "\"ain't\": \"is not\",\n",
        "\"aren't\": \"are not\",\n",
        "\"can't\": \"cannot\",\n",
        "\"can't've\": \"cannot have\",\n",
        "\"'cause\": \"because\",\n",
        "\"could've\": \"could have\",\n",
        "\"couldn't\": \"could not\",\n",
        "\"couldn't've\": \"could not have\",\n",
        "\"didn't\": \"did not\",\n",
        "\"doesn't\": \"does not\",\n",
        "\"don't\": \"do not\",\n",
        "\"hadn't\": \"had not\",\n",
        "\"hadn't've\": \"had not have\",\n",
        "\"hasn't\": \"has not\",\n",
        "\"haven't\": \"have not\",\n",
        "\"he'd\": \"he would\",\n",
        "\"he'd've\": \"he would have\",\n",
        "\"he'll\": \"he will\",\n",
        "\"he'll've\": \"he he will have\",\n",
        "\"he's\": \"he is\",\n",
        "\"how'd\": \"how did\",\n",
        "\"how'd'y\": \"how do you\",\n",
        "\"how'll\": \"how will\",\n",
        "\"how's\": \"how is\",\n",
        "\"i'd\": \"i would\",\n",
        "\"i'd've\": \"i would have\",\n",
        "\"i'll\": \"i will\",\n",
        "\"i'll've\": \"i will have\",\n",
        "\"i'm\": \"i am\",\n",
        "\"i've\": \"i have\",\n",
        "\"isn't\": \"is not\",\n",
        "\"it'd\": \"it would\",\n",
        "\"it'd've\": \"it would have\",\n",
        "\"it'll\": \"it will\",\n",
        "\"it'll've\": \"it will have\",\n",
        "\"it's\": \"it is\",\n",
        "\"let's\": \"let us\",\n",
        "\"ma'am\": \"madam\",\n",
        "\"mayn't\": \"may not\",\n",
        "\"might've\": \"might have\",\n",
        "\"mightn't\": \"might not\",\n",
        "\"mightn't've\": \"might not have\",\n",
        "\"must've\": \"must have\",\n",
        "\"mustn't\": \"must not\",\n",
        "\"mustn't've\": \"must not have\",\n",
        "\"needn't\": \"need not\",\n",
        "\"needn't've\": \"need not have\",\n",
        "\"o'clock\": \"of the clock\",\n",
        "\"oughtn't\": \"ought not\",\n",
        "\"oughtn't've\": \"ought not have\",\n",
        "\"shan't\": \"shall not\",\n",
        "\"sha'n't\": \"shall not\",\n",
        "\"shan't've\": \"shall not have\",\n",
        "\"she'd\": \"she would\",\n",
        "\"she'd've\": \"she would have\",\n",
        "\"she'll\": \"she will\",\n",
        "\"she'll've\": \"she will have\",\n",
        "\"she's\": \"she is\",\n",
        "\"should've\": \"should have\",\n",
        "\"shouldn't\": \"should not\",\n",
        "\"shouldn't've\": \"should not have\",\n",
        "\"so've\": \"so have\",\n",
        "\"so's\": \"so as\",\n",
        "\"that'd\": \"that would\",\n",
        "\"that'd've\": \"that would have\",\n",
        "\"that's\": \"that is\",\n",
        "\"there'd\": \"there would\",\n",
        "\"there'd've\": \"there would have\",\n",
        "\"there's\": \"there is\",\n",
        "\"they'd\": \"they would\",\n",
        "\"they'd've\": \"they would have\",\n",
        "\"they'll\": \"they will\",\n",
        "\"they'll've\": \"they will have\",\n",
        "\"they're\": \"they are\",\n",
        "\"they've\": \"they have\",\n",
        "\"to've\": \"to have\",\n",
        "\"wasn't\": \"was not\",\n",
        "\"we'd\": \"we would\",\n",
        "\"we'd've\": \"we would have\",\n",
        "\"we'll\": \"we will\",\n",
        "\"we'll've\": \"we will have\",\n",
        "\"we're\": \"we are\",\n",
        "\"we've\": \"we have\",\n",
        "\"weren't\": \"were not\",\n",
        "\"what'll\": \"what will\",\n",
        "\"what'll've\": \"what will have\",\n",
        "\"what're\": \"what are\",\n",
        "\"what's\": \"what is\",\n",
        "\"what've\": \"what have\",\n",
        "\"when's\": \"when is\",\n",
        "\"when've\": \"when have\",\n",
        "\"where'd\": \"where did\",\n",
        "\"where's\": \"where is\",\n",
        "\"where've\": \"where have\",\n",
        "\"who'll\": \"who will\",\n",
        "\"who'll've\": \"who will have\",\n",
        "\"who's\": \"who is\",\n",
        "\"who've\": \"who have\",\n",
        "\"why's\": \"why is\",\n",
        "\"why've\": \"why have\",\n",
        "\"will've\": \"will have\",\n",
        "\"won't\": \"will not\",\n",
        "\"won't've\": \"will not have\",\n",
        "\"would've\": \"would have\",\n",
        "\"wouldn't\": \"would not\",\n",
        "\"wouldn't've\": \"would not have\",\n",
        "\"y'all\": \"you all\",\n",
        "\"y'all'd\": \"you all would\",\n",
        "\"y'all'd've\": \"you all would have\",\n",
        "\"y'all're\": \"you all are\",\n",
        "\"y'all've\": \"you all have\",\n",
        "\"you'd\": \"you would\",\n",
        "\"you'd've\": \"you would have\",\n",
        "\"you'll\": \"you will\",\n",
        "\"you'll've\": \"you will have\",\n",
        "\"you're\": \"you are\",\n",
        "\"you've\": \"you have\",\n",
        "}"
      ],
      "metadata": {
        "id": "SwbHKYS5laGN"
      },
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def expand_contractions(sentences):\n",
        "  contractions_re = re.compile('(%s)'%'|'.join(CONTRACTION_MAP.keys()))\n",
        "  def exp_cont(s, contractions_dict=CONTRACTION_MAP):\n",
        "    def replace(match):\n",
        "      return contractions_dict[match.group(0)]\n",
        "    return contractions_re.sub(replace, s)\n",
        "  for i in range(len(sentences)):\n",
        "    sentences[i] = exp_cont(sentences[i])\n"
      ],
      "metadata": {
        "id": "SmK41aqlmDXq"
      },
      "execution_count": 15,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentences"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XKMkBnBu6aC5",
        "outputId": "ded977d1-dcff-49fc-d7c3-46f7df8e0e74"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([\"Explanation\\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\",\n",
              "       \"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)\",\n",
              "       \"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.\",\n",
              "       ...,\n",
              "       'Spitzer \\n\\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',\n",
              "       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',\n",
              "       '\"\\nAnd ... I really don\\'t think you understand.  I came here and my idea was bad right away.  What kind of community goes \"\"you have bad ideas\"\" go away, instead of helping rewrite them.   \"'],\n",
              "      dtype=object)"
            ]
          },
          "metadata": {},
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# expand_contractions(sentences)"
      ],
      "metadata": {
        "id": "4tAHKGfSmwL4"
      },
      "execution_count": 17,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from nltk.tokenize import word_tokenize\n",
        "nltk.download('punkt')\n",
        "word_tokenize(sentences[0])"
      ],
      "metadata": {
        "id": "Ic1c0ajxm2Dw",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "ada4bd35-7985-4b9f-d3fe-db07bfd85188"
      },
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['Explanation',\n",
              " 'Why',\n",
              " 'the',\n",
              " 'edits',\n",
              " 'made',\n",
              " 'under',\n",
              " 'my',\n",
              " 'username',\n",
              " 'Hardcore',\n",
              " 'Metallica',\n",
              " 'Fan',\n",
              " 'were',\n",
              " 'reverted',\n",
              " '?',\n",
              " 'They',\n",
              " 'were',\n",
              " \"n't\",\n",
              " 'vandalisms',\n",
              " ',',\n",
              " 'just',\n",
              " 'closure',\n",
              " 'on',\n",
              " 'some',\n",
              " 'GAs',\n",
              " 'after',\n",
              " 'I',\n",
              " 'voted',\n",
              " 'at',\n",
              " 'New',\n",
              " 'York',\n",
              " 'Dolls',\n",
              " 'FAC',\n",
              " '.',\n",
              " 'And',\n",
              " 'please',\n",
              " 'do',\n",
              " \"n't\",\n",
              " 'remove',\n",
              " 'the',\n",
              " 'template',\n",
              " 'from',\n",
              " 'the',\n",
              " 'talk',\n",
              " 'page',\n",
              " 'since',\n",
              " 'I',\n",
              " \"'m\",\n",
              " 'retired',\n",
              " 'now.89.205.38.27']"
            ]
          },
          "metadata": {},
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 2 : REMOVE NEWLINES AND TABS**"
      ],
      "metadata": {
        "id": "24RwZS7jqpRu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def remove_newlines_and_tabs(sentences):\n",
        "  \n",
        "  for i in range(len(sentences)):\n",
        "    sentences[i] = sentences[i].replace('\\n',' ').replace('\\t',' ').replace('\\\\', ' ')"
      ],
      "metadata": {
        "id": "hAE_XZhNo6Sg"
      },
      "execution_count": 19,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "remove_newlines_and_tabs(sentences)"
      ],
      "metadata": {
        "id": "ES2lNuosrlD7"
      },
      "execution_count": 20,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentences[0]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        },
        "id": "EiYPVRCeroDa",
        "outputId": "40ecf610-3451-4cc4-8cf9-caaab929eed1"
      },
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 3: REMOVE ALL STOPWORDS**"
      ],
      "metadata": {
        "id": "Hq8pJmkctVCe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "stoplist = set(stopwords.words('english'))"
      ],
      "metadata": {
        "id": "AkRl7hzAvcfL"
      },
      "execution_count": 22,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "stoplist"
      ],
      "metadata": {
        "id": "mO2GkSx8v-5y",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "afe4c26b-6dad-41db-ca84-56b6e448bf33"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'a',\n",
              " 'about',\n",
              " 'above',\n",
              " 'after',\n",
              " 'again',\n",
              " 'against',\n",
              " 'ain',\n",
              " 'all',\n",
              " 'am',\n",
              " 'an',\n",
              " 'and',\n",
              " 'any',\n",
              " 'are',\n",
              " 'aren',\n",
              " \"aren't\",\n",
              " 'as',\n",
              " 'at',\n",
              " 'be',\n",
              " 'because',\n",
              " 'been',\n",
              " 'before',\n",
              " 'being',\n",
              " 'below',\n",
              " 'between',\n",
              " 'both',\n",
              " 'but',\n",
              " 'by',\n",
              " 'can',\n",
              " 'couldn',\n",
              " \"couldn't\",\n",
              " 'd',\n",
              " 'did',\n",
              " 'didn',\n",
              " \"didn't\",\n",
              " 'do',\n",
              " 'does',\n",
              " 'doesn',\n",
              " \"doesn't\",\n",
              " 'doing',\n",
              " 'don',\n",
              " \"don't\",\n",
              " 'down',\n",
              " 'during',\n",
              " 'each',\n",
              " 'few',\n",
              " 'for',\n",
              " 'from',\n",
              " 'further',\n",
              " 'had',\n",
              " 'hadn',\n",
              " \"hadn't\",\n",
              " 'has',\n",
              " 'hasn',\n",
              " \"hasn't\",\n",
              " 'have',\n",
              " 'haven',\n",
              " \"haven't\",\n",
              " 'having',\n",
              " 'he',\n",
              " 'her',\n",
              " 'here',\n",
              " 'hers',\n",
              " 'herself',\n",
              " 'him',\n",
              " 'himself',\n",
              " 'his',\n",
              " 'how',\n",
              " 'i',\n",
              " 'if',\n",
              " 'in',\n",
              " 'into',\n",
              " 'is',\n",
              " 'isn',\n",
              " \"isn't\",\n",
              " 'it',\n",
              " \"it's\",\n",
              " 'its',\n",
              " 'itself',\n",
              " 'just',\n",
              " 'll',\n",
              " 'm',\n",
              " 'ma',\n",
              " 'me',\n",
              " 'mightn',\n",
              " \"mightn't\",\n",
              " 'more',\n",
              " 'most',\n",
              " 'mustn',\n",
              " \"mustn't\",\n",
              " 'my',\n",
              " 'myself',\n",
              " 'needn',\n",
              " \"needn't\",\n",
              " 'no',\n",
              " 'nor',\n",
              " 'not',\n",
              " 'now',\n",
              " 'o',\n",
              " 'of',\n",
              " 'off',\n",
              " 'on',\n",
              " 'once',\n",
              " 'only',\n",
              " 'or',\n",
              " 'other',\n",
              " 'our',\n",
              " 'ours',\n",
              " 'ourselves',\n",
              " 'out',\n",
              " 'over',\n",
              " 'own',\n",
              " 're',\n",
              " 's',\n",
              " 'same',\n",
              " 'shan',\n",
              " \"shan't\",\n",
              " 'she',\n",
              " \"she's\",\n",
              " 'should',\n",
              " \"should've\",\n",
              " 'shouldn',\n",
              " \"shouldn't\",\n",
              " 'so',\n",
              " 'some',\n",
              " 'such',\n",
              " 't',\n",
              " 'than',\n",
              " 'that',\n",
              " \"that'll\",\n",
              " 'the',\n",
              " 'their',\n",
              " 'theirs',\n",
              " 'them',\n",
              " 'themselves',\n",
              " 'then',\n",
              " 'there',\n",
              " 'these',\n",
              " 'they',\n",
              " 'this',\n",
              " 'those',\n",
              " 'through',\n",
              " 'to',\n",
              " 'too',\n",
              " 'under',\n",
              " 'until',\n",
              " 'up',\n",
              " 've',\n",
              " 'very',\n",
              " 'was',\n",
              " 'wasn',\n",
              " \"wasn't\",\n",
              " 'we',\n",
              " 'were',\n",
              " 'weren',\n",
              " \"weren't\",\n",
              " 'what',\n",
              " 'when',\n",
              " 'where',\n",
              " 'which',\n",
              " 'while',\n",
              " 'who',\n",
              " 'whom',\n",
              " 'why',\n",
              " 'will',\n",
              " 'with',\n",
              " 'won',\n",
              " \"won't\",\n",
              " 'wouldn',\n",
              " \"wouldn't\",\n",
              " 'y',\n",
              " 'you',\n",
              " \"you'd\",\n",
              " \"you'll\",\n",
              " \"you're\",\n",
              " \"you've\",\n",
              " 'your',\n",
              " 'yours',\n",
              " 'yourself',\n",
              " 'yourselves'}"
            ]
          },
          "metadata": {},
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def remove_stopwords(sentences):\n",
        "  for i in range(len(sentences)):\n",
        "    tokens = word_tokenize(sentences[i])\n",
        "    \n",
        "    filtered_tokens = [token for token in tokens if token.lower() not in stoplist]\n",
        "    sentences[i] = \" \".join(filtered_tokens)"
      ],
      "metadata": {
        "id": "eFM8kzLywViL"
      },
      "execution_count": 24,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# remove_stopwords(sentences)"
      ],
      "metadata": {
        "id": "DxM1LXQ-xOsx"
      },
      "execution_count": 25,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentences[0]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        },
        "id": "zBKsllEvxYqp",
        "outputId": "a1e0cd93-e4d8-4af4-d81a-c2fc787e8220"
      },
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 26
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "X_train[0]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        },
        "id": "SFBlPvME445_",
        "outputId": "9da0c4d0-2ccd-4552-de9c-b6195c8cc6f2"
      },
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 27
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 4: LEMMETIZATION**"
      ],
      "metadata": {
        "id": "xFJzNYvb5HgE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "nltk.download('wordnet')\n",
        "nltk.download('omw-1.4')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nidlSbOF54Eg",
        "outputId": "940afaf6-7680-4ee5-d136-2ee178658285"
      },
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/wordnet.zip.\n",
            "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/omw-1.4.zip.\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 28
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n",
        "lemmatizer = nltk.stem.WordNetLemmatizer()"
      ],
      "metadata": {
        "id": "AR4oMkl84_0S"
      },
      "execution_count": 29,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def lemmetization(sentences):\n",
        "  for i in range(len(sentences)):\n",
        "    lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(sentences[i])]\n",
        "\n",
        "    sentences[i] = \" \".join(lemma)"
      ],
      "metadata": {
        "id": "ecvmGMq_5jFV"
      },
      "execution_count": 30,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# lemmetization(sentences)"
      ],
      "metadata": {
        "id": "i7WGctMU5zc3"
      },
      "execution_count": 31,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentences[0]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        },
        "id": "hOBeoBDx5-Lb",
        "outputId": "b4cac999-d23b-451d-85ed-983dc80bcf65"
      },
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 32
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**COMPLETE PREPROCESSING**"
      ],
      "metadata": {
        "id": "FsxyOMoIfGX7"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def preprocess(sentences):\n",
        "  expand_contractions(sentences)\n",
        "  remove_newlines_and_tabs(sentences)\n",
        "  remove_stopwords(sentences)\n",
        "  lemmetization(sentences)\n"
      ],
      "metadata": {
        "id": "x6lG8QmcfM3I"
      },
      "execution_count": 33,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train = np.asarray(X_train)\n"
      ],
      "metadata": {
        "id": "MOqqqCaTiSxP"
      },
      "execution_count": 34,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_test = np.asarray(X_test)"
      ],
      "metadata": {
        "id": "Fe2ae8plibLg"
      },
      "execution_count": 35,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# try:\n",
        "#   file = open('X_train.pickle')\n",
        "#   X_train = pickle.load(file)\n",
        "# except:\n",
        "#   preprocess(X_train)"
      ],
      "metadata": {
        "id": "LLMhKjStfgUI"
      },
      "execution_count": 36,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# try:\n",
        "#   file = open('X_test.pickle')\n",
        "#   X_test = pickle.load(file)\n",
        "# except:\n",
        "#   preprocess(X_test)"
      ],
      "metadata": {
        "id": "ra6ArxduicgO"
      },
      "execution_count": 37,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preprocess(X_train)\n"
      ],
      "metadata": {
        "id": "p6hpfOgYUM2s"
      },
      "execution_count": 38,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "preprocess(X_test)"
      ],
      "metadata": {
        "id": "60Cxbqm6a6-x",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 311
        },
        "outputId": "c2901f80-a873-4d9a-afda-ee542d7fd391"
      },
      "execution_count": 39,
      "outputs": [
        {
          "output_type": "error",
          "ename": "KeyboardInterrupt",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-39-e1adff6a4102>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
            "\u001b[0;32m<ipython-input-33-5bcef99561d9>\u001b[0m in \u001b[0;36mpreprocess\u001b[0;34m(sentences)\u001b[0m\n\u001b[1;32m      2\u001b[0m   \u001b[0mexpand_contractions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m   \u001b[0mremove_newlines_and_tabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m   \u001b[0mremove_stopwords\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m   \u001b[0mlemmetization\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m<ipython-input-24-b8fe72b23e7b>\u001b[0m in \u001b[0;36mremove_stopwords\u001b[0;34m(sentences)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mremove_stopwords\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m   \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mtokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mword_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mfiltered_tokens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtokens\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstoplist\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/tokenize/__init__.py\u001b[0m in \u001b[0;36mword_tokenize\u001b[0;34m(text, language, preserve_line)\u001b[0m\n\u001b[1;32m    129\u001b[0m     \u001b[0msentences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreserve_line\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0msent_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlanguage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    130\u001b[0m     return [\n\u001b[0;32m--> 131\u001b[0;31m         \u001b[0mtoken\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msentences\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_treebank_word_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    132\u001b[0m     ]\n",
            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/tokenize/__init__.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    129\u001b[0m     \u001b[0msentences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpreserve_line\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0msent_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlanguage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    130\u001b[0m     return [\n\u001b[0;32m--> 131\u001b[0;31m         \u001b[0mtoken\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msentences\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_treebank_word_tokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    132\u001b[0m     ]\n",
            "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/tokenize/destructive.py\u001b[0m in \u001b[0;36mtokenize\u001b[0;34m(self, text, convert_parentheses, return_str)\u001b[0m\n\u001b[1;32m    177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    178\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mregexp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubstitution\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mENDING_QUOTES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m             \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mregexp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubstitution\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    180\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    181\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mregexp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCONTRACTIONS2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/lib/python3.7/re.py\u001b[0m in \u001b[0;36m_subx\u001b[0;34m(pattern, template)\u001b[0m\n\u001b[1;32m    307\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0msre_parse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpand_template\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemplate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    308\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 309\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_subx\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemplate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    310\u001b[0m     \u001b[0;31m# internal: Pattern.sub/subn implementation helper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    311\u001b[0m     \u001b[0mtemplate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_compile_repl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemplate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpattern\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "len(X_train)"
      ],
      "metadata": {
        "id": "f64s33hMWHAR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(Y_train)"
      ],
      "metadata": {
        "id": "Y1CJHBIoWIfX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train[:10]"
      ],
      "metadata": {
        "id": "Zhj2NzjVRivv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train[:10]"
      ],
      "metadata": {
        "id": "hghd8rlsgiNt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        ""
      ],
      "metadata": {
        "id": "E8pMmNvHfPXC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pickle\n",
        "\n",
        "# Store data (serialize)\n",
        "with open('X_train.pickle', 'wb') as handle:\n",
        "    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)"
      ],
      "metadata": {
        "id": "hZOaW4aEk4xe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pickle\n",
        "\n",
        "# Store data (serialize)\n",
        "with open('X_test.pickle', 'wb') as handle:\n",
        "    pickle.dump(X_test, handle, protocol=pickle.HIGHEST_PROTOCOL)"
      ],
      "metadata": {
        "id": "klbZTEsak9XA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 5: TOKENIZATION**"
      ],
      "metadata": {
        "id": "8Z_YNy2pjkm2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer = Tokenizer()"
      ],
      "metadata": {
        "id": "xXv3o79Djkm9"
      },
      "execution_count": 61,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer.fit_on_texts(X_train)"
      ],
      "metadata": {
        "id": "WgbvrxJBjkm-"
      },
      "execution_count": 62,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train"
      ],
      "metadata": {
        "id": "OH9-l_l2j7Bv",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "d9ef3853-977e-4340-9923-b3576cd2d19e"
      },
      "execution_count": 63,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([\"Explanation edit make username Hardcore Metallica Fan revert ? vandalisms , closure GAs vote New York Dolls FAC . please remove template talk page since 'm retire now.89.205.38.27\",\n",
              "       \"D'aww ! match background colour 'm seemingly stick . Thanks . ( talk ) 21:51 , January 11 , 2016 ( UTC )\",\n",
              "       \"Hey man , 'm really try edit war . 's guy constantly remove relevant information talk edit instead talk page . seem care format actual info .\",\n",
              "       ...,\n",
              "       'Spitzer Umm , theres actual article prostitution ring . - Crunch Captain .',\n",
              "       'look like actually put speedy first version delete look .',\n",
              "       \"`` ... really think understand . come idea bad right away . kind community go `` '' bad ideas '' '' go away , instead help rewrite them. ``\"],\n",
              "      dtype=object)"
            ]
          },
          "metadata": {},
          "execution_count": 63
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "X_train_tokenized = tokenizer.texts_to_sequences(X_train)"
      ],
      "metadata": {
        "id": "_HMHOBXgjkm-"
      },
      "execution_count": 64,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train_tokenized"
      ],
      "metadata": {
        "id": "by_5XhtTjkm-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        ""
      ],
      "metadata": {
        "id": "Sr3MNvqvjqrb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 6: FIND MAX LENGTH OF SENTENCES**"
      ],
      "metadata": {
        "id": "5IBFsrGXsu3F"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "max_len = 0\n",
        "test = \"\"\n",
        "j=0\n",
        "for i,sentence in enumerate(X_train_tokenized):\n",
        "  length = len(sentence)\n",
        "  if length>max_len:\n",
        "    j=i\n",
        "    max_len = length\n",
        "    test = sentence"
      ],
      "metadata": {
        "id": "EcFlSbPgstPo"
      },
      "execution_count": 45,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "max_len"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2U5-2NbqtJMK",
        "outputId": "4968498b-afeb-4032-811b-98e258564d56"
      },
      "execution_count": 46,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1348"
            ]
          },
          "metadata": {},
          "execution_count": 46
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**STEP 7 : PAD SEQUENCES**"
      ],
      "metadata": {
        "id": "NsRq1qlXv-bk"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "X_train_processed = pad_sequences(X_train_tokenized,maxlen=max_len,padding = 'post')"
      ],
      "metadata": {
        "id": "TlUpXXnPvZRm"
      },
      "execution_count": 47,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train_processed"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Xe6caWuBwNKh",
        "outputId": "60dae5f9-e24e-4d38-e578-097a64cb179e"
      },
      "execution_count": 48,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[  562,     7,    10, ...,     0,     0,     0],\n",
              "       [86373,   934,   431, ...,     0,     0,     0],\n",
              "       [  305,   312,    25, ...,     0,     0,     0],\n",
              "       ...,\n",
              "       [27845,  6291,  4403, ...,     0,     0,     0],\n",
              "       [   41,    13,   139, ...,     0,     0,     0],\n",
              "       [   74,    14,   124, ...,     0,     0,     0]], dtype=int32)"
            ]
          },
          "metadata": {},
          "execution_count": 48
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**SIMPLE NN MODEL**"
      ],
      "metadata": {
        "id": "32Qp_ks2wse0"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Test"
      ],
      "metadata": {
        "id": "5P0hiAqj-bAM"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "max_features=100000\n",
        "tokenizer = Tokenizer(num_words=max_features)\n",
        "tokenizer.fit_on_texts(list(X_train))\n",
        "list_tokenized_train = tokenizer.texts_to_sequences(X_train)\n"
      ],
      "metadata": {
        "id": "qCoNNFx3-aWv"
      },
      "execution_count": 53,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "maxpadlen = 200\n",
        "X_t=pad_sequences(list_tokenized_train, maxlen=maxpadlen, padding = 'post')\n"
      ],
      "metadata": {
        "id": "_f3D936I-lct"
      },
      "execution_count": 54,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_t"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "G0-f0_J8-srQ",
        "outputId": "4c6f4468-772f-4c74-b1a6-e4d8114be98f"
      },
      "execution_count": 55,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[  562,     7,    10, ...,     0,     0,     0],\n",
              "       [86373,   934,   431, ...,     0,     0,     0],\n",
              "       [  305,   312,    25, ...,     0,     0,     0],\n",
              "       ...,\n",
              "       [27845,  6291,  4403, ...,     0,     0,     0],\n",
              "       [   41,    13,   139, ...,     0,     0,     0],\n",
              "       [   74,    14,   124, ...,     0,     0,     0]], dtype=int32)"
            ]
          },
          "metadata": {},
          "execution_count": 55
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        ""
      ],
      "metadata": {
        "id": "NhVw8fNyezD3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "# x_train, x_val, y_train, y_val = train_test_split(X_train_processed, Y_train, test_size=0.2)\n",
        "lstm_model = keras.Sequential([\n",
        "    keras.layers.Embedding(max_features+1,32) ,          \n",
        "    keras.layers.Bidirectional(keras.layers.LSTM(32, activation='tanh')) ,         \n",
        "    keras.layers.Dense(128, activation=\"relu\"),\n",
        "    keras.layers.Dense(256, activation=\"relu\"),\n",
        "    keras.layers.Dense(128, activation=\"relu\"),\n",
        "    keras.layers.Dense(6, activation=\"sigmoid\")\n",
        "])\n",
        "lstm_model.compile(loss=\"BinaryCrossentropy\", optimizer=\"Adam\", metrics=[\"accuracy\"])\n",
        "model_history = lstm_model.fit(X_t, Y_train, epochs=1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XUa6X8eA-2yS",
        "outputId": "4c734a3a-2dc1-4c31-954a-fd64c8904bb1"
      },
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "4987/4987 [==============================] - 101s 19ms/step - loss: 0.0619 - accuracy: 0.9897\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "res = lstm_model.predict(np.expand_dims(X_t[15],axis=0))\n",
        "\n",
        "(res > 0.5).astype(int)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4grayjEH_5Pu",
        "outputId": "f780c913-97b0-488b-8c44-9544df9b202d"
      },
      "execution_count": 145,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[0, 0, 0, 0, 0, 0]])"
            ]
          },
          "metadata": {},
          "execution_count": 145
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "Y_train.iloc[12]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iTTVDTG-AGo4",
        "outputId": "82856c35-7903-45d1-e50a-32c32d8bd5b7"
      },
      "execution_count": 142,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "toxic            1\n",
              "severe_toxic     0\n",
              "obscene          0\n",
              "threat           0\n",
              "insult           0\n",
              "identity_hate    0\n",
              "Name: 12, dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 142
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "input_text = 'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'"
      ],
      "metadata": {
        "id": "R0zjKTuVAX8u"
      },
      "execution_count": 160,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def predict_using_simple_model(text):\n",
        "  sentences =[text]\n",
        "  expand_contractions(sentences)\n",
        "  remove_newlines_and_tabs(sentences)\n",
        "  remove_stopwords(sentences)\n",
        "  lemmetization(sentences)\n",
        "  print(sentences)\n",
        "  tokenized_text = tokenizer.texts_to_sequences(sentences)\n",
        "  padded_text = pad_sequences(tokenized_text,maxlen=maxpadlen,padding = 'post')\n",
        "  print(lstm_model.predict(padded_text))\n",
        "  return lstm_model.predict(padded_text)"
      ],
      "metadata": {
        "id": "mN_eJSP1AZ-d"
      },
      "execution_count": 158,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "res = predict_using_simple_model(input_text)\n",
        "(res > 0.5).astype(int)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9Mb2krODAadW",
        "outputId": "3733a10c-e889-4d38-dc22-f77745e55aae"
      },
      "execution_count": 159,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['HATE BLACK']\n",
            "[[0.8108128  0.04237914 0.459502   0.02853931 0.3863017  0.10296743]]\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[1, 0, 0, 0, 0, 0]])"
            ]
          },
          "metadata": {},
          "execution_count": 159
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Test 2**"
      ],
      "metadata": {
        "id": "I0zU8ZbPgHLf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "# x_train, x_val, y_train, y_val = train_test_split(X_train_processed, Y_train, test_size=0.2)\n",
        "lstm_model_2 = keras.Sequential([\n",
        "    keras.layers.Embedding(max_features+1,32) ,          \n",
        "    keras.layers.Bidirectional(keras.layers.LSTM(32, activation='tanh')) ,         \n",
        "    keras.layers.Dense(128, activation=\"relu\"),\n",
        "    keras.layers.Dense(256, activation=\"relu\"),\n",
        "    keras.layers.Dense(128, activation=\"relu\"),\n",
        "    keras.layers.Dense(6, activation=\"sigmoid\")\n",
        "])\n",
        "lstm_model_2.compile(loss=\"BinaryCrossentropy\", optimizer=\"Adam\", metrics=[\"accuracy\"])\n",
        "model_history = lstm_model_2.fit(X_train_processed, Y_train, epochs=1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4GTMdGgfgGxm",
        "outputId": "6b526957-1217-4ca1-d19f-f1744fb4a751"
      },
      "execution_count": 59,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "4987/4987 [==============================] - 458s 91ms/step - loss: 0.0589 - accuracy: 0.9895\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "res = lstm_model_2.predict(np.expand_dims(X_train_processed[6],axis=0))\n",
        "\n",
        "(res > 0.5).astype(int)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "UWS_61LOgWwI",
        "outputId": "4e5a42e4-8904-4ee4-c797-31c56c02a0f8"
      },
      "execution_count": 60,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "array([[1, 0, 1, 0, 1, 0]])"
            ]
          },
          "metadata": {},
          "execution_count": 60
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "text = 'Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!'"
      ],
      "metadata": {
        "id": "_LYBbAIhnHt6"
      },
      "execution_count": 68,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def predict_using_text(text):\n",
        "  sentences = [text]\n",
        "  expand_contractions(sentences)\n",
        "  remove_newlines_and_tabs(sentences)\n",
        "  remove_stopwords(sentences)\n",
        "  lemmetization(sentences)\n",
        "  tokenized = tokenizer.texts_to_sequences(sentences)\n",
        "  padded = pad_sequences(tokenized,maxlen=max_len,padding = 'post')\n",
        "  res = lstm_model_2.predict(padded)\n",
        "  print((res > 0.5).astype(int))"
      ],
      "metadata": {
        "id": "10wbT70anQDk"
      },
      "execution_count": 72,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "predict_using_text(text)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XmyBnnebocWu",
        "outputId": "7de8aa84-531d-44a2-986f-c9954a87674b"
      },
      "execution_count": 73,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[1 0 1 0 1 0]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "lstm_model_2.save('comment_toxicity_model.h5')"
      ],
      "metadata": {
        "id": "QTpNX9bIzGsk"
      },
      "execution_count": 76,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Gradio**"
      ],
      "metadata": {
        "id": "Pe8m1Efby9-y"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "!pip install gradio jinja2"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QcJ11A7eyzuL",
        "outputId": "60ef21b2-02fb-4fd2-fe0b-f907d22d3fb5"
      },
      "execution_count": 74,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting gradio\n",
            "  Downloading gradio-3.0.20-py3-none-any.whl (5.1 MB)\n",
            "\u001b[K     |████████████████████████████████| 5.1 MB 4.1 MB/s \n",
            "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (2.11.3)\n",
            "Collecting analytics-python\n",
            "  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from gradio) (1.3.5)\n",
            "Collecting paramiko\n",
            "  Downloading paramiko-2.11.0-py2.py3-none-any.whl (212 kB)\n",
            "\u001b[K     |████████████████████████████████| 212 kB 53.9 MB/s \n",
            "\u001b[?25hCollecting python-multipart\n",
            "  Downloading python-multipart-0.0.5.tar.gz (32 kB)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from gradio) (1.21.6)\n",
            "Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from gradio) (7.1.2)\n",
            "Collecting fastapi\n",
            "  Downloading fastapi-0.78.0-py3-none-any.whl (54 kB)\n",
            "\u001b[K     |████████████████████████████████| 54 kB 3.1 MB/s \n",
            "\u001b[?25hCollecting ffmpy\n",
            "  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)\n",
            "Collecting aiohttp\n",
            "  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n",
            "\u001b[K     |████████████████████████████████| 1.1 MB 49.7 MB/s \n",
            "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from gradio) (2.23.0)\n",
            "Collecting uvicorn\n",
            "  Downloading uvicorn-0.18.1-py3-none-any.whl (57 kB)\n",
            "\u001b[K     |████████████████████████████████| 57 kB 2.7 MB/s \n",
            "\u001b[?25hCollecting fsspec\n",
            "  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)\n",
            "\u001b[K     |████████████████████████████████| 140 kB 30.4 MB/s \n",
            "\u001b[?25hCollecting pydub\n",
            "  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
            "Collecting pycryptodome\n",
            "  Downloading pycryptodome-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)\n",
            "\u001b[K     |████████████████████████████████| 2.3 MB 35.0 MB/s \n",
            "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from gradio) (3.2.2)\n",
            "Collecting markdown-it-py[linkify,plugins]\n",
            "  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)\n",
            "\u001b[K     |████████████████████████████████| 84 kB 2.8 MB/s \n",
            "\u001b[?25hCollecting orjson\n",
            "  Downloading orjson-3.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (256 kB)\n",
            "\u001b[K     |████████████████████████████████| 256 kB 42.8 MB/s \n",
            "\u001b[?25hRequirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2) (2.0.1)\n",
            "Collecting asynctest==0.13.0\n",
            "  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->gradio) (21.4.0)\n",
            "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->gradio) (2.0.12)\n",
            "Collecting aiosignal>=1.1.2\n",
            "  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n",
            "Collecting async-timeout<5.0,>=4.0.0a3\n",
            "  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from aiohttp->gradio) (4.1.1)\n",
            "Collecting multidict<7.0,>=4.5\n",
            "  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n",
            "\u001b[K     |████████████████████████████████| 94 kB 3.0 MB/s \n",
            "\u001b[?25hCollecting yarl<2.0,>=1.0\n",
            "  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
            "\u001b[K     |████████████████████████████████| 271 kB 39.8 MB/s \n",
            "\u001b[?25hCollecting frozenlist>=1.1.1\n",
            "  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n",
            "\u001b[K     |████████████████████████████████| 144 kB 42.3 MB/s \n",
            "\u001b[?25hRequirement already satisfied: idna>=2.0 in /usr/local/lib/python3.7/dist-packages (from yarl<2.0,>=1.0->aiohttp->gradio) (2.10)\n",
            "Requirement already satisfied: python-dateutil>2.1 in /usr/local/lib/python3.7/dist-packages (from analytics-python->gradio) (2.8.2)\n",
            "Collecting backoff==1.10.0\n",
            "  Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)\n",
            "Collecting monotonic>=1.5\n",
            "  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from analytics-python->gradio) (1.15.0)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (1.24.3)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (2022.6.15)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (3.0.4)\n",
            "Requirement already satisfied: pydantic!=1.7,!=1.7.1,!=1.7.2,!=1.7.3,!=1.8,!=1.8.1,<2.0.0,>=1.6.2 in /usr/local/lib/python3.7/dist-packages (from fastapi->gradio) (1.8.2)\n",
            "Collecting starlette==0.19.1\n",
            "  Downloading starlette-0.19.1-py3-none-any.whl (63 kB)\n",
            "\u001b[K     |████████████████████████████████| 63 kB 1.8 MB/s \n",
            "\u001b[?25hCollecting anyio<5,>=3.4.0\n",
            "  Downloading anyio-3.6.1-py3-none-any.whl (80 kB)\n",
            "\u001b[K     |████████████████████████████████| 80 kB 9.2 MB/s \n",
            "\u001b[?25hCollecting sniffio>=1.1\n",
            "  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)\n",
            "Collecting mdurl~=0.1\n",
            "  Downloading mdurl-0.1.1-py3-none-any.whl (10 kB)\n",
            "Collecting mdit-py-plugins\n",
            "  Downloading mdit_py_plugins-0.3.0-py3-none-any.whl (43 kB)\n",
            "\u001b[K     |████████████████████████████████| 43 kB 2.1 MB/s \n",
            "\u001b[?25hCollecting linkify-it-py~=1.0\n",
            "  Downloading linkify_it_py-1.0.3-py3-none-any.whl (19 kB)\n",
            "Collecting uc-micro-py\n",
            "  Downloading uc_micro_py-1.0.1-py3-none-any.whl (6.2 kB)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (3.0.9)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (1.4.3)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (0.11.0)\n",
            "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->gradio) (2022.1)\n",
            "Collecting cryptography>=2.5\n",
            "  Downloading cryptography-37.0.2-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)\n",
            "\u001b[K     |████████████████████████████████| 4.0 MB 24.4 MB/s \n",
            "\u001b[?25hCollecting bcrypt>=3.1.3\n",
            "  Downloading bcrypt-3.2.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (62 kB)\n",
            "\u001b[K     |████████████████████████████████| 62 kB 732 kB/s \n",
            "\u001b[?25hCollecting pynacl>=1.0.1\n",
            "  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)\n",
            "\u001b[K     |████████████████████████████████| 856 kB 36.7 MB/s \n",
            "\u001b[?25hRequirement already satisfied: cffi>=1.1 in /usr/local/lib/python3.7/dist-packages (from bcrypt>=3.1.3->paramiko->gradio) (1.15.0)\n",
            "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->gradio) (2.21)\n",
            "Collecting h11>=0.8\n",
            "  Downloading h11-0.13.0-py3-none-any.whl (58 kB)\n",
            "\u001b[K     |████████████████████████████████| 58 kB 4.6 MB/s \n",
            "\u001b[?25hRequirement already satisfied: click>=7.0 in /usr/local/lib/python3.7/dist-packages (from uvicorn->gradio) (7.1.2)\n",
            "Building wheels for collected packages: ffmpy, python-multipart\n",
            "  Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4712 sha256=c83267f00c3d705a61d99547a2b2b531230130bc5c0710fa4e5c5aec60fd9053\n",
            "  Stored in directory: /root/.cache/pip/wheels/13/e4/6c/e8059816e86796a597c6e6b0d4c880630f51a1fcfa0befd5e6\n",
            "  Building wheel for python-multipart (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for python-multipart: filename=python_multipart-0.0.5-py3-none-any.whl size=31678 sha256=9402f9afbc2e02540246432e83f77f34bbec24d28b37a1bfe8b85b740b5e843b\n",
            "  Stored in directory: /root/.cache/pip/wheels/2c/41/7c/bfd1c180534ffdcc0972f78c5758f89881602175d48a8bcd2c\n",
            "Successfully built ffmpy python-multipart\n",
            "Installing collected packages: sniffio, mdurl, uc-micro-py, multidict, markdown-it-py, frozenlist, anyio, yarl, starlette, pynacl, monotonic, mdit-py-plugins, linkify-it-py, h11, cryptography, bcrypt, backoff, asynctest, async-timeout, aiosignal, uvicorn, python-multipart, pydub, pycryptodome, paramiko, orjson, fsspec, ffmpy, fastapi, analytics-python, aiohttp, gradio\n",
            "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 analytics-python-1.4.0 anyio-3.6.1 async-timeout-4.0.2 asynctest-0.13.0 backoff-1.10.0 bcrypt-3.2.2 cryptography-37.0.2 fastapi-0.78.0 ffmpy-0.3.0 frozenlist-1.3.0 fsspec-2022.5.0 gradio-3.0.20 h11-0.13.0 linkify-it-py-1.0.3 markdown-it-py-2.1.0 mdit-py-plugins-0.3.0 mdurl-0.1.1 monotonic-1.6 multidict-6.0.2 orjson-3.7.3 paramiko-2.11.0 pycryptodome-3.15.0 pydub-0.25.1 pynacl-1.5.0 python-multipart-0.0.5 sniffio-1.2.0 starlette-0.19.1 uc-micro-py-1.0.1 uvicorn-0.18.1 yarl-1.7.2\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import tensorflow as tf\n",
        "import gradio as gr"
      ],
      "metadata": {
        "id": "SioQk70yzCmg"
      },
      "execution_count": 75,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def score_comment(comment):\n",
        "    sentences = [comment]\n",
        "    expand_contractions(sentences)\n",
        "    remove_newlines_and_tabs(sentences)\n",
        "    remove_stopwords(sentences)\n",
        "    lemmetization(sentences)\n",
        "    tokenized = tokenizer.texts_to_sequences(sentences)\n",
        "    padded = pad_sequences(tokenized,maxlen=max_len,padding = 'post')\n",
        "    results = lstm_model_2.predict(padded)\n",
        "    \n",
        "    text = ''\n",
        "    for idx, col in enumerate(train.columns[2:]):\n",
        "        text += '{}: {}\\n'.format(col, results[0][idx]>0.5)\n",
        "    print(text)\n",
        "    return text"
      ],
      "metadata": {
        "id": "8NXDXUiOzE8P"
      },
      "execution_count": 81,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "score_comment(text)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 174
        },
        "id": "JMNVJ0AqzoaL",
        "outputId": "c3150373-d8f1-44d6-a84c-59aa9a4fa76e"
      },
      "execution_count": 82,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "toxic: True\n",
            "severe_toxic: False\n",
            "obscene: True\n",
            "threat: False\n",
            "insult: True\n",
            "identity_hate: False\n",
            "\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'toxic: True\\nsevere_toxic: False\\nobscene: True\\nthreat: False\\ninsult: True\\nidentity_hate: False\\n'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 82
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "interface = gr.Interface(fn=score_comment, \n",
        "                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),\n",
        "                        outputs='text')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vxOtY1hAz6Pa",
        "outputId": "6b6c7060-fc54-4c22-eb4f-a8b57ffb377f"
      },
      "execution_count": 83,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `optional` parameter is deprecated, and it has no effect\n",
            "  warnings.warn(value)\n",
            "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `numeric` parameter is deprecated, and it has no effect\n",
            "  warnings.warn(value)\n",
            "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: The 'type' parameter has been deprecated. Use the Number component instead.\n",
            "  warnings.warn(value)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "interface.launch(share=True)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 663
        },
        "id": "vxx6NnCUz_O-",
        "outputId": "aa6057b4-7dbd-457a-a296-f648d5b83926"
      },
      "execution_count": 84,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`\n",
            "Running on public URL: https://53293.gradio.app\n",
            "\n",
            "This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "<div><iframe src=\"https://53293.gradio.app\" width=\"900\" height=\"500\" allow=\"autoplay; camera; microphone;\" frameborder=\"0\" allowfullscreen></iframe></div>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(<gradio.routes.App at 0x7fe3a02b58d0>,\n",
              " 'http://127.0.0.1:7860/',\n",
              " 'https://53293.gradio.app')"
            ]
          },
          "metadata": {},
          "execution_count": 84
        }
      ]
    }
  ]
}