{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "dP6l8vOUtV0J"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "EDetmMNGuXY9"
      },
      "outputs": [],
      "source": [
        "df = pd.read_csv(\"/content/training.1600000.processed.noemoticon.csv\", encoding='latin-1')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "yDJXNYjqwYo2",
        "outputId": "b56cb388-71f6-4d0e-c651-0201854cb952"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "         0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  \\\n",
              "0        0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   \n",
              "1        0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   \n",
              "2        0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   \n",
              "3        0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   \n",
              "4        0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY   \n",
              "...     ..         ...                           ...       ...   \n",
              "1599994  4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   \n",
              "1599995  4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   \n",
              "1599996  4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   \n",
              "1599997  4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   \n",
              "1599998  4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   \n",
              "\n",
              "         _TheSpecialOne_  \\\n",
              "0          scotthamilton   \n",
              "1               mattycus   \n",
              "2                ElleCTF   \n",
              "3                 Karoli   \n",
              "4               joy_wolf   \n",
              "...                  ...   \n",
              "1599994  AmandaMarie1028   \n",
              "1599995      TheWDBoards   \n",
              "1599996           bpbabe   \n",
              "1599997     tinydiamondz   \n",
              "1599998   RyanTrevMorris   \n",
              "\n",
              "        @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  \n",
              "0        is upset that he can't update his Facebook by ...                                                                   \n",
              "1        @Kenichan I dived many times for the ball. Man...                                                                   \n",
              "2          my whole body feels itchy and like its on fire                                                                    \n",
              "3        @nationwideclass no, it's not behaving at all....                                                                   \n",
              "4                            @Kwesidei not the whole crew                                                                    \n",
              "...                                                    ...                                                                   \n",
              "1599994  Just woke up. Having no school is the best fee...                                                                   \n",
              "1599995  TheWDB.com - Very cool to hear old Walt interv...                                                                   \n",
              "1599996  Are you ready for your MoJo Makeover? Ask me f...                                                                   \n",
              "1599997  Happy 38th Birthday to my boo of alll time!!! ...                                                                   \n",
              "1599998  happy #charitytuesday @theNSPCC @SparksCharity...                                                                   \n",
              "\n",
              "[1599999 rows x 6 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-9a4869c7-52c7-433e-8ded-b3e9664736d8\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1467810369</th>\n",
              "      <th>Mon Apr 06 22:19:45 PDT 2009</th>\n",
              "      <th>NO_QUERY</th>\n",
              "      <th>_TheSpecialOne_</th>\n",
              "      <th>@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>1467810672</td>\n",
              "      <td>Mon Apr 06 22:19:49 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>scotthamilton</td>\n",
              "      <td>is upset that he can't update his Facebook by ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>0</td>\n",
              "      <td>1467810917</td>\n",
              "      <td>Mon Apr 06 22:19:53 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>mattycus</td>\n",
              "      <td>@Kenichan I dived many times for the ball. Man...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>0</td>\n",
              "      <td>1467811184</td>\n",
              "      <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>ElleCTF</td>\n",
              "      <td>my whole body feels itchy and like its on fire</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>0</td>\n",
              "      <td>1467811193</td>\n",
              "      <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>Karoli</td>\n",
              "      <td>@nationwideclass no, it's not behaving at all....</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>0</td>\n",
              "      <td>1467811372</td>\n",
              "      <td>Mon Apr 06 22:20:00 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>joy_wolf</td>\n",
              "      <td>@Kwesidei not the whole crew</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1599994</th>\n",
              "      <td>4</td>\n",
              "      <td>2193601966</td>\n",
              "      <td>Tue Jun 16 08:40:49 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>AmandaMarie1028</td>\n",
              "      <td>Just woke up. Having no school is the best fee...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1599995</th>\n",
              "      <td>4</td>\n",
              "      <td>2193601969</td>\n",
              "      <td>Tue Jun 16 08:40:49 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>TheWDBoards</td>\n",
              "      <td>TheWDB.com - Very cool to hear old Walt interv...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1599996</th>\n",
              "      <td>4</td>\n",
              "      <td>2193601991</td>\n",
              "      <td>Tue Jun 16 08:40:49 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>bpbabe</td>\n",
              "      <td>Are you ready for your MoJo Makeover? Ask me f...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1599997</th>\n",
              "      <td>4</td>\n",
              "      <td>2193602064</td>\n",
              "      <td>Tue Jun 16 08:40:49 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>tinydiamondz</td>\n",
              "      <td>Happy 38th Birthday to my boo of alll time!!! ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1599998</th>\n",
              "      <td>4</td>\n",
              "      <td>2193602129</td>\n",
              "      <td>Tue Jun 16 08:40:50 PDT 2009</td>\n",
              "      <td>NO_QUERY</td>\n",
              "      <td>RyanTrevMorris</td>\n",
              "      <td>happy #charitytuesday @theNSPCC @SparksCharity...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>1599999 rows × 6 columns</p>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9a4869c7-52c7-433e-8ded-b3e9664736d8')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-9a4869c7-52c7-433e-8ded-b3e9664736d8 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-9a4869c7-52c7-433e-8ded-b3e9664736d8');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-262012d3-4035-4656-b2b9-9f0759747dc6\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-262012d3-4035-4656-b2b9-9f0759747dc6')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-262012d3-4035-4656-b2b9-9f0759747dc6 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "  <div id=\"id_ff10cfaf-9638-4c95-a28a-4f78765fae3c\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('df')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_ff10cfaf-9638-4c95-a28a-4f78765fae3c button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('df');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df"
            }
          },
          "metadata": {},
          "execution_count": 3
        }
      ],
      "source": [
        "df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 290
        },
        "id": "kZqf-vIww7VB",
        "outputId": "e1337690-1fc2-4e84-c800-e7e588a925fd"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0                                                                                                                      0\n",
              "1467810369                                                                                                             0\n",
              "Mon Apr 06 22:19:45 PDT 2009                                                                                           0\n",
              "NO_QUERY                                                                                                               0\n",
              "_TheSpecialOne_                                                                                                        0\n",
              "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    0\n",
              "dtype: int64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1467810369</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Mon Apr 06 22:19:45 PDT 2009</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>NO_QUERY</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>_TheSpecialOne_</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D</th>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div><br><label><b>dtype:</b> int64</label>"
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ],
      "source": [
        "df.isna().sum()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 490
        },
        "id": "6yNPDs81xDEt",
        "outputId": "26edc6a0-3338-4835-d87a-2f94db46b618"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1467810369\n",
              "1563681287    2\n",
              "2062516845    2\n",
              "1551586713    2\n",
              "1676311044    2\n",
              "1791602739    2\n",
              "             ..\n",
              "2197310899    1\n",
              "2197310477    1\n",
              "2197310452    1\n",
              "2197310381    1\n",
              "2197311865    1\n",
              "Name: count, Length: 1598314, dtype: int64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1467810369</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>1563681287</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2062516845</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1551586713</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1676311044</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1791602739</th>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2197310899</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2197310477</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2197310452</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2197310381</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2197311865</th>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>1598314 rows × 1 columns</p>\n",
              "</div><br><label><b>dtype:</b> int64</label>"
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ],
      "source": [
        "df['1467810369'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 147
        },
        "id": "qSDrw9sHxS2n",
        "outputId": "81bad2a3-5bab-440f-9fb8-7a87d99c5e92"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "NO_QUERY\n",
              "NO_QUERY    1599999\n",
              "Name: count, dtype: int64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>NO_QUERY</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>NO_QUERY</th>\n",
              "      <td>1599999</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div><br><label><b>dtype:</b> int64</label>"
            ]
          },
          "metadata": {},
          "execution_count": 6
        }
      ],
      "source": [
        "df['NO_QUERY'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "id": "3Ix-o-IJxW-B"
      },
      "outputs": [],
      "source": [
        "df.drop('NO_QUERY',axis=1,inplace=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "IjITKGzlxhEv",
        "outputId": "96f77f50-72aa-470b-9675-3b0cf66a8352"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0                                                                                                                       int64\n",
            "1467810369                                                                                                              int64\n",
            "Mon Apr 06 22:19:45 PDT 2009                                                                                           object\n",
            "_TheSpecialOne_                                                                                                        object\n",
            "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    object\n",
            "dtype: object\n"
          ]
        }
      ],
      "source": [
        "df = df.sample(frac=0.1, random_state=42)  # %10 veriyi al\n",
        "print(df.dtypes)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Nx3sG-x313SH",
        "outputId": "c2577bc1-6b84-4e13-9fc1-8f5f31578160"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<class 'pandas.core.frame.DataFrame'>\n",
            "Index: 16000 entries, 1497139 to 1284035\n",
            "Data columns (total 5 columns):\n",
            " #   Column                                                                                                               Non-Null Count  Dtype \n",
            "---  ------                                                                                                               --------------  ----- \n",
            " 0   0                                                                                                                    16000 non-null  int64 \n",
            " 1   1467810369                                                                                                           16000 non-null  int64 \n",
            " 2   Mon Apr 06 22:19:45 PDT 2009                                                                                         16000 non-null  object\n",
            " 3   _TheSpecialOne_                                                                                                      16000 non-null  object\n",
            " 4   @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  16000 non-null  object\n",
            "dtypes: int64(2), object(3)\n",
            "memory usage: 750.0+ KB\n"
          ]
        }
      ],
      "source": [
        "df.info()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 300
        },
        "id": "6dtZVvA515sK",
        "outputId": "9a7af184-7a43-48ee-f453-d4eb224c6dde"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                  0    1467810369\n",
              "count  16000.000000  1.600000e+04\n",
              "mean       2.023250  1.997504e+09\n",
              "std        1.999927  1.928363e+08\n",
              "min        0.000000  1.467834e+09\n",
              "25%        0.000000  1.956811e+09\n",
              "50%        4.000000  2.001829e+09\n",
              "75%        4.000000  2.176166e+09\n",
              "max        4.000000  2.329169e+09"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-bbce33f4-ac2b-4370-ad57-044dac02812c\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1467810369</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>16000.000000</td>\n",
              "      <td>1.600000e+04</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>2.023250</td>\n",
              "      <td>1.997504e+09</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>1.999927</td>\n",
              "      <td>1.928363e+08</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.467834e+09</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.956811e+09</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>4.000000</td>\n",
              "      <td>2.001829e+09</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>4.000000</td>\n",
              "      <td>2.176166e+09</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>4.000000</td>\n",
              "      <td>2.329169e+09</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-bbce33f4-ac2b-4370-ad57-044dac02812c')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-bbce33f4-ac2b-4370-ad57-044dac02812c button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-bbce33f4-ac2b-4370-ad57-044dac02812c');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-d0abdab4-75f9-484f-90a9-b2db825dd472\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-d0abdab4-75f9-484f-90a9-b2db825dd472')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-d0abdab4-75f9-484f-90a9-b2db825dd472 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"0\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 5656.045202020064,\n        \"min\": 0.0,\n        \"max\": 16000.0,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          2.02325,\n          4.0,\n          1.9999273535150421\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"1467810369\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 911085603.3007622,\n        \"min\": 16000.0,\n        \"max\": 2329168959.0,\n        \"num_unique_values\": 8,\n        \"samples\": [\n          1997503968.4805624,\n          2001828620.0,\n          16000.0\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 11
        }
      ],
      "source": [
        "df.describe()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SkWc-Myp2am4",
        "outputId": "bf370764-f04f-4501-e048-a7b5fc59977b"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Count of IDs1467810369\n",
            "1979685249    1\n",
            "2047063573    1\n",
            "2208198047    1\n",
            "1996357873    1\n",
            "1676456732    1\n",
            "             ..\n",
            "2265368249    1\n",
            "1679452821    1\n",
            "2217064177    1\n",
            "2045691294    1\n",
            "2070290547    1\n",
            "Name: count, Length: 16000, dtype: int64, count of usernames_TheSpecialOne_\n",
            "Jayme1988         7\n",
            "wowlew            7\n",
            "ShesElectric_     5\n",
            "jbfanforever94    4\n",
            "maynaseric        4\n",
            "                 ..\n",
            "Stephi90          1\n",
            "ericzueff         1\n",
            "munoza13          1\n",
            "MissSTARcey       1\n",
            "Tmama21           1\n",
            "Name: count, Length: 15295, dtype: int64\n"
          ]
        }
      ],
      "source": [
        "print(f'Count of IDs{df[\"1467810369\"].value_counts()}, count of usernames{df[\"_TheSpecialOne_\"].value_counts()}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "0PAvL2M_3vzw"
      },
      "outputs": [],
      "source": [
        "from nltk.corpus import stopwords"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "MpHUYWR-CKrs"
      },
      "outputs": [],
      "source": [
        "import nltk"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0kcFyu7hCUdy",
        "outputId": "5e73979f-761d-4c9d-a5ff-4a48f501aa4e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 15
        }
      ],
      "source": [
        "nltk.download(\"stopwords\")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "df = df.drop(columns=['1467810369'])"
      ],
      "metadata": {
        "id": "uql-EelvCFrN"
      },
      "execution_count": 16,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 632
        },
        "id": "Ur2vB4KWCrZL",
        "outputId": "7ff638d7-3cba-410c-b5ac-d479e70d362f"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "         0  Mon Apr 06 22:19:45 PDT 2009  _TheSpecialOne_  \\\n",
              "1497139  4  Sun Jun 07 17:26:02 PDT 2009         Jill_Osi   \n",
              "1350782  4  Fri Jun 05 11:05:43 PDT 2009          vmramos   \n",
              "589668   0  Wed Jun 17 20:20:31 PDT 2009       icorganics   \n",
              "52428    0  Sat May 02 09:23:27 PDT 2009           mundah   \n",
              "738013   0  Sun Jun 21 06:25:38 PDT 2009            skdev   \n",
              "...     ..                           ...              ...   \n",
              "1438815  4  Sat Jun 06 20:47:03 PDT 2009  HalfassBackward   \n",
              "298459   0  Mon Jun 01 17:16:49 PDT 2009     half_Milkman   \n",
              "961996   4  Sun May 17 10:12:06 PDT 2009         katizzle   \n",
              "1519397  4  Mon Jun 15 02:49:50 PDT 2009        michieong   \n",
              "1284035  4  Tue Jun 02 02:49:00 PDT 2009         itsJohno   \n",
              "\n",
              "                                                tweet_text  \n",
              "1497139         Can't wait to be at glacier national park   \n",
              "1350782  @virtualhispanic Falling apart, ha? Maybe you ...  \n",
              "589668   Oh my gosh, there is a Mom Entrepreneur of the...  \n",
              "52428    what shall i drink during the game?: Stella, M...  \n",
              "738013      @awaisnaseer @blessedAyesha ki LCD kharab hay   \n",
              "...                                                    ...  \n",
              "1438815  @bjolena Glad to hear your day went well  Mine...  \n",
              "298459   @continuity_plus Lol...or responded to the req...  \n",
              "961996             manicure done: today -&gt; black nails   \n",
              "1519397  okay, decided to listen to David Archuleta, no...  \n",
              "1284035  woah, everything looks amazing. i cant bloody ...  \n",
              "\n",
              "[16000 rows x 4 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-20ab79bf-7f41-44d9-94b5-d6f1b89cb399\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>Mon Apr 06 22:19:45 PDT 2009</th>\n",
              "      <th>_TheSpecialOne_</th>\n",
              "      <th>tweet_text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>1497139</th>\n",
              "      <td>4</td>\n",
              "      <td>Sun Jun 07 17:26:02 PDT 2009</td>\n",
              "      <td>Jill_Osi</td>\n",
              "      <td>Can't wait to be at glacier national park</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1350782</th>\n",
              "      <td>4</td>\n",
              "      <td>Fri Jun 05 11:05:43 PDT 2009</td>\n",
              "      <td>vmramos</td>\n",
              "      <td>@virtualhispanic Falling apart, ha? Maybe you ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>589668</th>\n",
              "      <td>0</td>\n",
              "      <td>Wed Jun 17 20:20:31 PDT 2009</td>\n",
              "      <td>icorganics</td>\n",
              "      <td>Oh my gosh, there is a Mom Entrepreneur of the...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>52428</th>\n",
              "      <td>0</td>\n",
              "      <td>Sat May 02 09:23:27 PDT 2009</td>\n",
              "      <td>mundah</td>\n",
              "      <td>what shall i drink during the game?: Stella, M...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>738013</th>\n",
              "      <td>0</td>\n",
              "      <td>Sun Jun 21 06:25:38 PDT 2009</td>\n",
              "      <td>skdev</td>\n",
              "      <td>@awaisnaseer @blessedAyesha ki LCD kharab hay</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1438815</th>\n",
              "      <td>4</td>\n",
              "      <td>Sat Jun 06 20:47:03 PDT 2009</td>\n",
              "      <td>HalfassBackward</td>\n",
              "      <td>@bjolena Glad to hear your day went well  Mine...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>298459</th>\n",
              "      <td>0</td>\n",
              "      <td>Mon Jun 01 17:16:49 PDT 2009</td>\n",
              "      <td>half_Milkman</td>\n",
              "      <td>@continuity_plus Lol...or responded to the req...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>961996</th>\n",
              "      <td>4</td>\n",
              "      <td>Sun May 17 10:12:06 PDT 2009</td>\n",
              "      <td>katizzle</td>\n",
              "      <td>manicure done: today -&amp;gt; black nails</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1519397</th>\n",
              "      <td>4</td>\n",
              "      <td>Mon Jun 15 02:49:50 PDT 2009</td>\n",
              "      <td>michieong</td>\n",
              "      <td>okay, decided to listen to David Archuleta, no...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1284035</th>\n",
              "      <td>4</td>\n",
              "      <td>Tue Jun 02 02:49:00 PDT 2009</td>\n",
              "      <td>itsJohno</td>\n",
              "      <td>woah, everything looks amazing. i cant bloody ...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>16000 rows × 4 columns</p>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-20ab79bf-7f41-44d9-94b5-d6f1b89cb399')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-20ab79bf-7f41-44d9-94b5-d6f1b89cb399 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-20ab79bf-7f41-44d9-94b5-d6f1b89cb399');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-8b6a3d7e-5cf7-432e-b9f9-53748fd2a3df\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-8b6a3d7e-5cf7-432e-b9f9-53748fd2a3df')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-8b6a3d7e-5cf7-432e-b9f9-53748fd2a3df button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "  <div id=\"id_1946659c-0cfa-478e-b4d3-63c863f846ad\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('df')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_1946659c-0cfa-478e-b4d3-63c863f846ad button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('df');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df",
              "repr_error": "0"
            }
          },
          "metadata": {},
          "execution_count": 17
        }
      ],
      "source": [
        "new_column_name = \"tweet_text\"\n",
        "df.rename(columns={df.columns[3]: new_column_name}, inplace=True)\n",
        "df"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def optimize_memory(df):\n",
        "    for col in df.select_dtypes(include=['int64', 'float64']).columns:\n",
        "        df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')\n",
        "    return df\n",
        "\n",
        "df = optimize_memory(df)\n"
      ],
      "metadata": {
        "id": "Ogt2OoXxYqvR"
      },
      "execution_count": 18,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for col in df.select_dtypes(include=['object']).columns:\n",
        "    df[col] = df[col].astype('category')\n"
      ],
      "metadata": {
        "id": "RwQUgoLNYuVr"
      },
      "execution_count": 19,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.utils import shuffle\n",
        "\n",
        "batch_size = 100000  # Her seferde 100 bin satır işle\n",
        "df = shuffle(df, random_state=42)  # Verileri karıştır\n",
        "\n",
        "for i in range(0, len(df), batch_size):\n",
        "    batch = df.iloc[i:i + batch_size]\n",
        "    # Burada modeli eğit ve belleği temizle\n"
      ],
      "metadata": {
        "id": "5otlQQxlVuLU"
      },
      "execution_count": 20,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 632
        },
        "id": "7xTOy6i1b9oO",
        "outputId": "dd96255e-9e20-4d6f-8afa-239f2b8b7d29"
      },
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "         0  Mon Apr 06 22:19:45 PDT 2009 _TheSpecialOne_  \\\n",
              "1405892  4  Sat Jun 06 08:59:56 PDT 2009        judomary   \n",
              "841227   4  Sun Apr 19 12:36:21 PDT 2009   KrystleMiller   \n",
              "1542496  4  Mon Jun 15 11:20:14 PDT 2009   tweeteradder7   \n",
              "705200   0  Sat Jun 20 12:32:46 PDT 2009        ClauBand   \n",
              "1240299  4  Mon Jun 01 10:52:34 PDT 2009       beccuhboo   \n",
              "...     ..                           ...             ...   \n",
              "34505    0  Mon Apr 20 05:07:13 PDT 2009      marawigirl   \n",
              "1569381  4  Mon Jun 15 21:49:18 PDT 2009         xkayjay   \n",
              "1082714  4  Fri May 29 22:07:41 PDT 2009        justelle   \n",
              "1203085  4  Sun May 31 18:08:19 PDT 2009         ninja24   \n",
              "20120    0  Sun Apr 19 01:28:19 PDT 2009     metafiktion   \n",
              "\n",
              "                                                tweet_text  \n",
              "1405892  @johnnystimson Still? Wow. Your cheese must ha...  \n",
              "841227   @ciara_danella you're supposed to write it on ...  \n",
              "1542496  @vanphotolens Get 100 followers a day using ww...  \n",
              "705200                      @ThiOliveiras To no ar Thi!!!   \n",
              "1240299                    its cold out today . i love it   \n",
              "...                                                    ...  \n",
              "34505    hates being a girl one day every month. red fl...  \n",
              "1569381  10 minutes to midnight!  going to get my jonas...  \n",
              "1082714  @krystyl deal girlie! It has been forever! Tom...  \n",
              "1203085  So can't be all sad about stuff time to move o...  \n",
              "20120    You don't quite get the same sense of satisfac...  \n",
              "\n",
              "[16000 rows x 4 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-3bb25075-c2f7-4753-b477-422a672ffb2b\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>Mon Apr 06 22:19:45 PDT 2009</th>\n",
              "      <th>_TheSpecialOne_</th>\n",
              "      <th>tweet_text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>1405892</th>\n",
              "      <td>4</td>\n",
              "      <td>Sat Jun 06 08:59:56 PDT 2009</td>\n",
              "      <td>judomary</td>\n",
              "      <td>@johnnystimson Still? Wow. Your cheese must ha...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>841227</th>\n",
              "      <td>4</td>\n",
              "      <td>Sun Apr 19 12:36:21 PDT 2009</td>\n",
              "      <td>KrystleMiller</td>\n",
              "      <td>@ciara_danella you're supposed to write it on ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1542496</th>\n",
              "      <td>4</td>\n",
              "      <td>Mon Jun 15 11:20:14 PDT 2009</td>\n",
              "      <td>tweeteradder7</td>\n",
              "      <td>@vanphotolens Get 100 followers a day using ww...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>705200</th>\n",
              "      <td>0</td>\n",
              "      <td>Sat Jun 20 12:32:46 PDT 2009</td>\n",
              "      <td>ClauBand</td>\n",
              "      <td>@ThiOliveiras To no ar Thi!!!</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1240299</th>\n",
              "      <td>4</td>\n",
              "      <td>Mon Jun 01 10:52:34 PDT 2009</td>\n",
              "      <td>beccuhboo</td>\n",
              "      <td>its cold out today . i love it</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>34505</th>\n",
              "      <td>0</td>\n",
              "      <td>Mon Apr 20 05:07:13 PDT 2009</td>\n",
              "      <td>marawigirl</td>\n",
              "      <td>hates being a girl one day every month. red fl...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1569381</th>\n",
              "      <td>4</td>\n",
              "      <td>Mon Jun 15 21:49:18 PDT 2009</td>\n",
              "      <td>xkayjay</td>\n",
              "      <td>10 minutes to midnight!  going to get my jonas...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1082714</th>\n",
              "      <td>4</td>\n",
              "      <td>Fri May 29 22:07:41 PDT 2009</td>\n",
              "      <td>justelle</td>\n",
              "      <td>@krystyl deal girlie! It has been forever! Tom...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1203085</th>\n",
              "      <td>4</td>\n",
              "      <td>Sun May 31 18:08:19 PDT 2009</td>\n",
              "      <td>ninja24</td>\n",
              "      <td>So can't be all sad about stuff time to move o...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>20120</th>\n",
              "      <td>0</td>\n",
              "      <td>Sun Apr 19 01:28:19 PDT 2009</td>\n",
              "      <td>metafiktion</td>\n",
              "      <td>You don't quite get the same sense of satisfac...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>16000 rows × 4 columns</p>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3bb25075-c2f7-4753-b477-422a672ffb2b')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-3bb25075-c2f7-4753-b477-422a672ffb2b button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-3bb25075-c2f7-4753-b477-422a672ffb2b');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-bed4820e-2fba-4b7e-8ddc-ce10c4b5f5e7\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-bed4820e-2fba-4b7e-8ddc-ce10c4b5f5e7')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-bed4820e-2fba-4b7e-8ddc-ce10c4b5f5e7 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "  <div id=\"id_dc1d8880-5d37-4526-8bdb-b8ffb69bce6e\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('df')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_dc1d8880-5d37-4526-8bdb-b8ffb69bce6e button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('df');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "df",
              "summary": "{\n  \"name\": \"df\",\n  \"rows\": 16000,\n  \"fields\": [\n    {\n      \"column\": \"0\",\n      \"properties\": {\n        \"dtype\": \"int8\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Mon Apr 06 22:19:45 PDT 2009\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 15821,\n        \"samples\": [\n          \"Fri May 29 17:37:13 PDT 2009\",\n          \"Tue Jun 02 07:27:45 PDT 2009\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_TheSpecialOne_\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 15295,\n        \"samples\": [\n          \"brittuhhnay\",\n          \"niamhscullionb\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"tweet_text\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 15977,\n        \"samples\": [\n          \"I iz lonely and wanting someone to cuddle with. \",\n          \"@lasercosmetica What a Blissful time at Wet today! \"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 24
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 26,
      "metadata": {
        "id": "_JuAy-coCXKH"
      },
      "outputs": [],
      "source": [
        "sample_text = df['tweet_text'].iloc[2] # Use iloc to access by position"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 27,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "id": "A_2fzixNC8Z9",
        "outputId": "556b7aab-2e8a-4c00-c24a-d6ac46c5872f"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 27
        }
      ],
      "source": [
        "sample_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 28,
      "metadata": {
        "id": "Hy3uO6kHDAEQ"
      },
      "outputs": [],
      "source": [
        "from bs4 import BeautifulSoup\n",
        "sample_text = BeautifulSoup(sample_text).get_text()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 29,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "id": "jcJIBe-QDUba",
        "outputId": "ce0da960-b572-459a-8513-acd25e1aaaba"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'@vanphotolens Get 100 followers a day using www.tweeterfollow.com Once you add everyone you are on the train or pay vip '"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 29
        }
      ],
      "source": [
        "sample_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "metadata": {
        "id": "fnm7NTcKDYxX"
      },
      "outputs": [],
      "source": [
        "import re"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "id": "COXSgv-hDcNH",
        "outputId": "1f98a7fb-57a3-4e9e-e6b8-49eeba96f704"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "' vanphotolens Get     followers a day using www tweeterfollow com Once you add everyone you are on the train or pay vip '"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 31
        }
      ],
      "source": [
        "sample_text = re.sub(\"[^a-zA-Z]\",' ',sample_text)\n",
        "sample_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "id": "8KGLrhAbENEm",
        "outputId": "42f72040-f208-485c-a2e1-69fb1f14a749"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "' vanphotolens get     followers a day using www tweeterfollow com once you add everyone you are on the train or pay vip '"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 32
        }
      ],
      "source": [
        "sample_text = sample_text.lower()\n",
        "sample_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 33,
      "metadata": {
        "id": "nvEA-zoMDzy3"
      },
      "outputs": [],
      "source": [
        "sample_text = sample_text.split()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 34,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hB0AcgPXD-pY",
        "outputId": "34949289-b050-40c5-f45d-84a4fdd10035"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['vanphotolens',\n",
              " 'get',\n",
              " 'followers',\n",
              " 'a',\n",
              " 'day',\n",
              " 'using',\n",
              " 'www',\n",
              " 'tweeterfollow',\n",
              " 'com',\n",
              " 'once',\n",
              " 'you',\n",
              " 'add',\n",
              " 'everyone',\n",
              " 'you',\n",
              " 'are',\n",
              " 'on',\n",
              " 'the',\n",
              " 'train',\n",
              " 'or',\n",
              " 'pay',\n",
              " 'vip']"
            ]
          },
          "metadata": {},
          "execution_count": 34
        }
      ],
      "source": [
        "sample_text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 35,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "C3YVPNJuEGdD",
        "outputId": "7a2c2100-c7bd-49f1-c53c-a29cd330566f"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "21"
            ]
          },
          "metadata": {},
          "execution_count": 35
        }
      ],
      "source": [
        "len(sample_text)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "stop_words = set(stopwords.words(\"english\"))\n",
        "stop_words"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EuuiDs2ewWXu",
        "outputId": "b4e7185f-9a08-411e-acff-452751f4d8e8"
      },
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'a',\n",
              " 'about',\n",
              " 'above',\n",
              " 'after',\n",
              " 'again',\n",
              " 'against',\n",
              " 'ain',\n",
              " 'all',\n",
              " 'am',\n",
              " 'an',\n",
              " 'and',\n",
              " 'any',\n",
              " 'are',\n",
              " 'aren',\n",
              " \"aren't\",\n",
              " 'as',\n",
              " 'at',\n",
              " 'be',\n",
              " 'because',\n",
              " 'been',\n",
              " 'before',\n",
              " 'being',\n",
              " 'below',\n",
              " 'between',\n",
              " 'both',\n",
              " 'but',\n",
              " 'by',\n",
              " 'can',\n",
              " 'couldn',\n",
              " \"couldn't\",\n",
              " 'd',\n",
              " 'did',\n",
              " 'didn',\n",
              " \"didn't\",\n",
              " 'do',\n",
              " 'does',\n",
              " 'doesn',\n",
              " \"doesn't\",\n",
              " 'doing',\n",
              " 'don',\n",
              " \"don't\",\n",
              " 'down',\n",
              " 'during',\n",
              " 'each',\n",
              " 'few',\n",
              " 'for',\n",
              " 'from',\n",
              " 'further',\n",
              " 'had',\n",
              " 'hadn',\n",
              " \"hadn't\",\n",
              " 'has',\n",
              " 'hasn',\n",
              " \"hasn't\",\n",
              " 'have',\n",
              " 'haven',\n",
              " \"haven't\",\n",
              " 'having',\n",
              " 'he',\n",
              " \"he'd\",\n",
              " \"he'll\",\n",
              " \"he's\",\n",
              " 'her',\n",
              " 'here',\n",
              " 'hers',\n",
              " 'herself',\n",
              " 'him',\n",
              " 'himself',\n",
              " 'his',\n",
              " 'how',\n",
              " 'i',\n",
              " \"i'd\",\n",
              " \"i'll\",\n",
              " \"i'm\",\n",
              " \"i've\",\n",
              " 'if',\n",
              " 'in',\n",
              " 'into',\n",
              " 'is',\n",
              " 'isn',\n",
              " \"isn't\",\n",
              " 'it',\n",
              " \"it'd\",\n",
              " \"it'll\",\n",
              " \"it's\",\n",
              " 'its',\n",
              " 'itself',\n",
              " 'just',\n",
              " 'll',\n",
              " 'm',\n",
              " 'ma',\n",
              " 'me',\n",
              " 'mightn',\n",
              " \"mightn't\",\n",
              " 'more',\n",
              " 'most',\n",
              " 'mustn',\n",
              " \"mustn't\",\n",
              " 'my',\n",
              " 'myself',\n",
              " 'needn',\n",
              " \"needn't\",\n",
              " 'no',\n",
              " 'nor',\n",
              " 'not',\n",
              " 'now',\n",
              " 'o',\n",
              " 'of',\n",
              " 'off',\n",
              " 'on',\n",
              " 'once',\n",
              " 'only',\n",
              " 'or',\n",
              " 'other',\n",
              " 'our',\n",
              " 'ours',\n",
              " 'ourselves',\n",
              " 'out',\n",
              " 'over',\n",
              " 'own',\n",
              " 're',\n",
              " 's',\n",
              " 'same',\n",
              " 'shan',\n",
              " \"shan't\",\n",
              " 'she',\n",
              " \"she'd\",\n",
              " \"she'll\",\n",
              " \"she's\",\n",
              " 'should',\n",
              " \"should've\",\n",
              " 'shouldn',\n",
              " \"shouldn't\",\n",
              " 'so',\n",
              " 'some',\n",
              " 'such',\n",
              " 't',\n",
              " 'than',\n",
              " 'that',\n",
              " \"that'll\",\n",
              " 'the',\n",
              " 'their',\n",
              " 'theirs',\n",
              " 'them',\n",
              " 'themselves',\n",
              " 'then',\n",
              " 'there',\n",
              " 'these',\n",
              " 'they',\n",
              " \"they'd\",\n",
              " \"they'll\",\n",
              " \"they're\",\n",
              " \"they've\",\n",
              " 'this',\n",
              " 'those',\n",
              " 'through',\n",
              " 'to',\n",
              " 'too',\n",
              " 'under',\n",
              " 'until',\n",
              " 'up',\n",
              " 've',\n",
              " 'very',\n",
              " 'was',\n",
              " 'wasn',\n",
              " \"wasn't\",\n",
              " 'we',\n",
              " \"we'd\",\n",
              " \"we'll\",\n",
              " \"we're\",\n",
              " \"we've\",\n",
              " 'were',\n",
              " 'weren',\n",
              " \"weren't\",\n",
              " 'what',\n",
              " 'when',\n",
              " 'where',\n",
              " 'which',\n",
              " 'while',\n",
              " 'who',\n",
              " 'whom',\n",
              " 'why',\n",
              " 'will',\n",
              " 'with',\n",
              " 'won',\n",
              " \"won't\",\n",
              " 'wouldn',\n",
              " \"wouldn't\",\n",
              " 'y',\n",
              " 'you',\n",
              " \"you'd\",\n",
              " \"you'll\",\n",
              " \"you're\",\n",
              " \"you've\",\n",
              " 'your',\n",
              " 'yours',\n",
              " 'yourself',\n",
              " 'yourselves'}"
            ]
          },
          "metadata": {},
          "execution_count": 36
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "sample_text = [w for w in sample_text if w not in stop_words]\n",
        "sample_text"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YhEMcfQ2wuC4",
        "outputId": "6d8c67ad-f1a7-4a8b-9b6e-adb41a05f0fe"
      },
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['vanphotolens',\n",
              " 'get',\n",
              " 'followers',\n",
              " 'day',\n",
              " 'using',\n",
              " 'www',\n",
              " 'tweeterfollow',\n",
              " 'com',\n",
              " 'add',\n",
              " 'everyone',\n",
              " 'train',\n",
              " 'pay',\n",
              " 'vip']"
            ]
          },
          "metadata": {},
          "execution_count": 37
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "len(sample_text)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "luUkKN5NxDVw",
        "outputId": "e1d12840-4715-42e9-d0e7-4a0e0fbfb40e"
      },
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "13"
            ]
          },
          "metadata": {},
          "execution_count": 38
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def process(review):\n",
        "  review = BeautifulSoup(review).get_text()\n",
        "  review = re.sub(\"[^a-zA-Z]\", ' ',review)\n",
        "  review.lower()\n",
        "  review = review.split()\n",
        "  stop_words = set(stopwords.words(\"english\"))\n",
        "  review = [w for w in review if w not in stop_words]\n",
        "  return (' '.join(review))"
      ],
      "metadata": {
        "id": "xKiD-a49xKyT"
      },
      "execution_count": 39,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_x_all = []\n",
        "for index in df.index:  # Iterate through DataFrame index values\n",
        "  if (index + 1) % 1000 == 0:\n",
        "    print(\"step of process\", index + 1)\n",
        "  train_x_all.append(process(df.loc[index, 'tweet_text']))  # Use loc for index-based access"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Q3kqZGLy3HmH",
        "outputId": "d9ad5685-c613-494a-898d-30d17955a183"
      },
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "step of process 772000\n",
            "step of process 323000\n",
            "step of process 305000\n",
            "step of process 164000\n",
            "step of process 1049000\n",
            "step of process 698000\n",
            "step of process 1586000\n",
            "step of process 1016000\n",
            "step of process 523000\n",
            "step of process 410000\n",
            "step of process 200000\n",
            "step of process 1317000\n",
            "step of process 1323000\n",
            "step of process 1151000\n",
            "step of process 1274000\n",
            "step of process 543000\n",
            "step of process 1482000\n",
            "step of process 1232000\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split"
      ],
      "metadata": {
        "id": "If9iJarT7jpe"
      },
      "execution_count": 42,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X = train_x_all\n",
        "y = np.array(df['0'])"
      ],
      "metadata": {
        "id": "meU1L-yP-NAm"
      },
      "execution_count": 43,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)"
      ],
      "metadata": {
        "id": "BcasAZ66-WQ2"
      },
      "execution_count": 44,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.feature_extraction.text import CountVectorizer"
      ],
      "metadata": {
        "id": "EYVnw4u7-ncX"
      },
      "execution_count": 45,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "vectorizer = CountVectorizer(max_features=5000)"
      ],
      "metadata": {
        "id": "wyfKn32LAM9w"
      },
      "execution_count": 46,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_x1 = vectorizer.fit_transform(X_train)\n",
        "train_x1"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RzeZaQSGATEY",
        "outputId": "433f5985-9115-4d33-9ca7-85572e2ebdd7"
      },
      "execution_count": 47,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<Compressed Sparse Row sparse matrix of dtype 'int64'\n",
              "\twith 81556 stored elements and shape (12800, 5000)>"
            ]
          },
          "metadata": {},
          "execution_count": 47
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "train_x1 = train_x1.toarray()"
      ],
      "metadata": {
        "id": "95Ma6XsKD966"
      },
      "execution_count": 48,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_y1 = y_train"
      ],
      "metadata": {
        "id": "FfBYLPu9D-_2"
      },
      "execution_count": 49,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_x1.shape, train_y1.shape"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "z1OKEqxCECFs",
        "outputId": "3352f2eb-e491-4171-ba78-9e256b230f20"
      },
      "execution_count": 50,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "((12800, 5000), (12800,))"
            ]
          },
          "metadata": {},
          "execution_count": 50
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import lightgbm as lgb\n",
        "model = lgb.LGBMClassifier()"
      ],
      "metadata": {
        "id": "djQ9FWOyEElZ"
      },
      "execution_count": 51,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model.fit(train_x1,train_y1)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 274
        },
        "id": "Mm-aVRfiEJ0t",
        "outputId": "d90a0ab3-e636-4bbd-c072-611495d7d0ce"
      },
      "execution_count": 52,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[LightGBM] [Info] Number of positive: 6468, number of negative: 6332\n",
            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051104 seconds.\n",
            "You can set `force_row_wise=true` to remove the overhead.\n",
            "And if memory is not enough, you can set `force_col_wise=true`.\n",
            "[LightGBM] [Info] Total Bins 2051\n",
            "[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 796\n",
            "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505313 -> initscore=0.021251\n",
            "[LightGBM] [Info] Start training from score 0.021251\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "LGBMClassifier()"
            ],
            "text/html": [
              "<style>#sk-container-id-1 {\n",
              "  /* Definition of color scheme common for light and dark mode */\n",
              "  --sklearn-color-text: #000;\n",
              "  --sklearn-color-text-muted: #666;\n",
              "  --sklearn-color-line: gray;\n",
              "  /* Definition of color scheme for unfitted estimators */\n",
              "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
              "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
              "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
              "  --sklearn-color-unfitted-level-3: chocolate;\n",
              "  /* Definition of color scheme for fitted estimators */\n",
              "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
              "  --sklearn-color-fitted-level-1: #d4ebff;\n",
              "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
              "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
              "\n",
              "  /* Specific color for light theme */\n",
              "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
              "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
              "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
              "  --sklearn-color-icon: #696969;\n",
              "\n",
              "  @media (prefers-color-scheme: dark) {\n",
              "    /* Redefinition of color scheme for dark theme */\n",
              "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
              "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
              "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
              "    --sklearn-color-icon: #878787;\n",
              "  }\n",
              "}\n",
              "\n",
              "#sk-container-id-1 {\n",
              "  color: var(--sklearn-color-text);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 pre {\n",
              "  padding: 0;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 input.sk-hidden--visually {\n",
              "  border: 0;\n",
              "  clip: rect(1px 1px 1px 1px);\n",
              "  clip: rect(1px, 1px, 1px, 1px);\n",
              "  height: 1px;\n",
              "  margin: -1px;\n",
              "  overflow: hidden;\n",
              "  padding: 0;\n",
              "  position: absolute;\n",
              "  width: 1px;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-dashed-wrapped {\n",
              "  border: 1px dashed var(--sklearn-color-line);\n",
              "  margin: 0 0.4em 0.5em 0.4em;\n",
              "  box-sizing: border-box;\n",
              "  padding-bottom: 0.4em;\n",
              "  background-color: var(--sklearn-color-background);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-container {\n",
              "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
              "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
              "     so we also need the `!important` here to be able to override the\n",
              "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
              "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
              "  display: inline-block !important;\n",
              "  position: relative;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-text-repr-fallback {\n",
              "  display: none;\n",
              "}\n",
              "\n",
              "div.sk-parallel-item,\n",
              "div.sk-serial,\n",
              "div.sk-item {\n",
              "  /* draw centered vertical line to link estimators */\n",
              "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
              "  background-size: 2px 100%;\n",
              "  background-repeat: no-repeat;\n",
              "  background-position: center center;\n",
              "}\n",
              "\n",
              "/* Parallel-specific style estimator block */\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel-item::after {\n",
              "  content: \"\";\n",
              "  width: 100%;\n",
              "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
              "  flex-grow: 1;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel {\n",
              "  display: flex;\n",
              "  align-items: stretch;\n",
              "  justify-content: center;\n",
              "  background-color: var(--sklearn-color-background);\n",
              "  position: relative;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel-item {\n",
              "  display: flex;\n",
              "  flex-direction: column;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
              "  align-self: flex-end;\n",
              "  width: 50%;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
              "  align-self: flex-start;\n",
              "  width: 50%;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
              "  width: 0;\n",
              "}\n",
              "\n",
              "/* Serial-specific style estimator block */\n",
              "\n",
              "#sk-container-id-1 div.sk-serial {\n",
              "  display: flex;\n",
              "  flex-direction: column;\n",
              "  align-items: center;\n",
              "  background-color: var(--sklearn-color-background);\n",
              "  padding-right: 1em;\n",
              "  padding-left: 1em;\n",
              "}\n",
              "\n",
              "\n",
              "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
              "clickable and can be expanded/collapsed.\n",
              "- Pipeline and ColumnTransformer use this feature and define the default style\n",
              "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
              "*/\n",
              "\n",
              "/* Pipeline and ColumnTransformer style (default) */\n",
              "\n",
              "#sk-container-id-1 div.sk-toggleable {\n",
              "  /* Default theme specific background. It is overwritten whether we have a\n",
              "  specific estimator or a Pipeline/ColumnTransformer */\n",
              "  background-color: var(--sklearn-color-background);\n",
              "}\n",
              "\n",
              "/* Toggleable label */\n",
              "#sk-container-id-1 label.sk-toggleable__label {\n",
              "  cursor: pointer;\n",
              "  display: flex;\n",
              "  width: 100%;\n",
              "  margin-bottom: 0;\n",
              "  padding: 0.5em;\n",
              "  box-sizing: border-box;\n",
              "  text-align: center;\n",
              "  align-items: start;\n",
              "  justify-content: space-between;\n",
              "  gap: 0.5em;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 label.sk-toggleable__label .caption {\n",
              "  font-size: 0.6rem;\n",
              "  font-weight: lighter;\n",
              "  color: var(--sklearn-color-text-muted);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
              "  /* Arrow on the left of the label */\n",
              "  content: \"▸\";\n",
              "  float: left;\n",
              "  margin-right: 0.25em;\n",
              "  color: var(--sklearn-color-icon);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
              "  color: var(--sklearn-color-text);\n",
              "}\n",
              "\n",
              "/* Toggleable content - dropdown */\n",
              "\n",
              "#sk-container-id-1 div.sk-toggleable__content {\n",
              "  max-height: 0;\n",
              "  max-width: 0;\n",
              "  overflow: hidden;\n",
              "  text-align: left;\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-0);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-0);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-toggleable__content pre {\n",
              "  margin: 0.2em;\n",
              "  border-radius: 0.25em;\n",
              "  color: var(--sklearn-color-text);\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-0);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-0);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
              "  /* Expand drop-down */\n",
              "  max-height: 200px;\n",
              "  max-width: 100%;\n",
              "  overflow: auto;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
              "  content: \"▾\";\n",
              "}\n",
              "\n",
              "/* Pipeline/ColumnTransformer-specific style */\n",
              "\n",
              "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
              "  color: var(--sklearn-color-text);\n",
              "  background-color: var(--sklearn-color-unfitted-level-2);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
              "  background-color: var(--sklearn-color-fitted-level-2);\n",
              "}\n",
              "\n",
              "/* Estimator-specific style */\n",
              "\n",
              "/* Colorize estimator box */\n",
              "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-2);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-2);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
              "#sk-container-id-1 div.sk-label label {\n",
              "  /* The background is the default theme color */\n",
              "  color: var(--sklearn-color-text-on-default-background);\n",
              "}\n",
              "\n",
              "/* On hover, darken the color of the background */\n",
              "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
              "  color: var(--sklearn-color-text);\n",
              "  background-color: var(--sklearn-color-unfitted-level-2);\n",
              "}\n",
              "\n",
              "/* Label box, darken color on hover, fitted */\n",
              "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
              "  color: var(--sklearn-color-text);\n",
              "  background-color: var(--sklearn-color-fitted-level-2);\n",
              "}\n",
              "\n",
              "/* Estimator label */\n",
              "\n",
              "#sk-container-id-1 div.sk-label label {\n",
              "  font-family: monospace;\n",
              "  font-weight: bold;\n",
              "  display: inline-block;\n",
              "  line-height: 1.2em;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-label-container {\n",
              "  text-align: center;\n",
              "}\n",
              "\n",
              "/* Estimator-specific */\n",
              "#sk-container-id-1 div.sk-estimator {\n",
              "  font-family: monospace;\n",
              "  border: 1px dotted var(--sklearn-color-border-box);\n",
              "  border-radius: 0.25em;\n",
              "  box-sizing: border-box;\n",
              "  margin-bottom: 0.5em;\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-0);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-estimator.fitted {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-0);\n",
              "}\n",
              "\n",
              "/* on hover */\n",
              "#sk-container-id-1 div.sk-estimator:hover {\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-2);\n",
              "}\n",
              "\n",
              "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-2);\n",
              "}\n",
              "\n",
              "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
              "\n",
              "/* Common style for \"i\" and \"?\" */\n",
              "\n",
              ".sk-estimator-doc-link,\n",
              "a:link.sk-estimator-doc-link,\n",
              "a:visited.sk-estimator-doc-link {\n",
              "  float: right;\n",
              "  font-size: smaller;\n",
              "  line-height: 1em;\n",
              "  font-family: monospace;\n",
              "  background-color: var(--sklearn-color-background);\n",
              "  border-radius: 1em;\n",
              "  height: 1em;\n",
              "  width: 1em;\n",
              "  text-decoration: none !important;\n",
              "  margin-left: 0.5em;\n",
              "  text-align: center;\n",
              "  /* unfitted */\n",
              "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
              "  color: var(--sklearn-color-unfitted-level-1);\n",
              "}\n",
              "\n",
              ".sk-estimator-doc-link.fitted,\n",
              "a:link.sk-estimator-doc-link.fitted,\n",
              "a:visited.sk-estimator-doc-link.fitted {\n",
              "  /* fitted */\n",
              "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
              "  color: var(--sklearn-color-fitted-level-1);\n",
              "}\n",
              "\n",
              "/* On hover */\n",
              "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
              ".sk-estimator-doc-link:hover,\n",
              "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
              ".sk-estimator-doc-link:hover {\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-3);\n",
              "  color: var(--sklearn-color-background);\n",
              "  text-decoration: none;\n",
              "}\n",
              "\n",
              "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
              ".sk-estimator-doc-link.fitted:hover,\n",
              "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
              ".sk-estimator-doc-link.fitted:hover {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-3);\n",
              "  color: var(--sklearn-color-background);\n",
              "  text-decoration: none;\n",
              "}\n",
              "\n",
              "/* Span, style for the box shown on hovering the info icon */\n",
              ".sk-estimator-doc-link span {\n",
              "  display: none;\n",
              "  z-index: 9999;\n",
              "  position: relative;\n",
              "  font-weight: normal;\n",
              "  right: .2ex;\n",
              "  padding: .5ex;\n",
              "  margin: .5ex;\n",
              "  width: min-content;\n",
              "  min-width: 20ex;\n",
              "  max-width: 50ex;\n",
              "  color: var(--sklearn-color-text);\n",
              "  box-shadow: 2pt 2pt 4pt #999;\n",
              "  /* unfitted */\n",
              "  background: var(--sklearn-color-unfitted-level-0);\n",
              "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
              "}\n",
              "\n",
              ".sk-estimator-doc-link.fitted span {\n",
              "  /* fitted */\n",
              "  background: var(--sklearn-color-fitted-level-0);\n",
              "  border: var(--sklearn-color-fitted-level-3);\n",
              "}\n",
              "\n",
              ".sk-estimator-doc-link:hover span {\n",
              "  display: block;\n",
              "}\n",
              "\n",
              "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
              "\n",
              "#sk-container-id-1 a.estimator_doc_link {\n",
              "  float: right;\n",
              "  font-size: 1rem;\n",
              "  line-height: 1em;\n",
              "  font-family: monospace;\n",
              "  background-color: var(--sklearn-color-background);\n",
              "  border-radius: 1rem;\n",
              "  height: 1rem;\n",
              "  width: 1rem;\n",
              "  text-decoration: none;\n",
              "  /* unfitted */\n",
              "  color: var(--sklearn-color-unfitted-level-1);\n",
              "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
              "  /* fitted */\n",
              "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
              "  color: var(--sklearn-color-fitted-level-1);\n",
              "}\n",
              "\n",
              "/* On hover */\n",
              "#sk-container-id-1 a.estimator_doc_link:hover {\n",
              "  /* unfitted */\n",
              "  background-color: var(--sklearn-color-unfitted-level-3);\n",
              "  color: var(--sklearn-color-background);\n",
              "  text-decoration: none;\n",
              "}\n",
              "\n",
              "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
              "  /* fitted */\n",
              "  background-color: var(--sklearn-color-fitted-level-3);\n",
              "}\n",
              "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LGBMClassifier()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>LGBMClassifier</div></div><div><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>LGBMClassifier()</pre></div> </div></div></div></div>"
            ]
          },
          "metadata": {},
          "execution_count": 52
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_x1 = vectorizer.transform(X_test)"
      ],
      "metadata": {
        "id": "aPP2bBrUISfM"
      },
      "execution_count": 53,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_x1"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Zl_juoJqIW72",
        "outputId": "b6dc4c6a-8ec3-4a24-aae7-563ac0ca675d"
      },
      "execution_count": 54,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<Compressed Sparse Row sparse matrix of dtype 'int64'\n",
              "\twith 19263 stored elements and shape (3200, 5000)>"
            ]
          },
          "metadata": {},
          "execution_count": 54
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_x1 = test_x1.toarray()"
      ],
      "metadata": {
        "id": "THVcAM_zIbT5"
      },
      "execution_count": 55,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_x1.shape"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "H_EZiKwoIbMY",
        "outputId": "a587ec03-5c1f-45f4-863b-f65ef56da33b"
      },
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(3200, 5000)"
            ]
          },
          "metadata": {},
          "execution_count": 56
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import roc_auc_score"
      ],
      "metadata": {
        "id": "zA1LCCL6ImY9"
      },
      "execution_count": 57,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_predict_proba = model.predict_proba(test_x1)[:, 1]  # Assuming binary classification, get probabilities for class 1\n",
        "roc_auc = roc_auc_score(y_test, test_predict_proba)  # Calculate ROC AUC using probabilities\n",
        "roc_auc"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JBaTvQjnImSm",
        "outputId": "ebca2dc8-ede9-4f34-8ee3-ea357499ad04"
      },
      "execution_count": 58,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "np.float64(0.7849473015873015)"
            ]
          },
          "metadata": {},
          "execution_count": 58
        }
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}