{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Comment Toxicity.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU",
    "gpuClass": "standard"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S99TDbfWWq0u",
        "outputId": "27146bb5-1f2c-42ce-9d75-f2e46418494e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "# Mount Google Drive so the dataset folder is reachable under /content/drive.\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Explicit %ls magic (does not rely on automagic) and an absolute path,\n",
        "# so the listing works regardless of the current working directory.\n",
        "%ls /content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6MtlVgwufR8M",
        "outputId": "acbed960-207b-4f6a-f5da-077cfba36c1d"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[0m\u001b[01;34msample_submission.csv\u001b[0m/ test.csv.zip train.csv.zip\n",
            "sample_submission.csv.zip \u001b[01;34mtest_labels.csv\u001b[0m/ X_test.pickle\n",
            "simple_model.h5 test_labels.csv.zip X_train.pickle\n",
            "\u001b[01;34mtest.csv\u001b[0m/ \u001b[01;34mtrain.csv\u001b[0m/\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Absolute path keeps this cell idempotent: a relative %cd fails when the\n",
        "# cell is re-run after the working directory has already changed.\n",
        "%cd /content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "la5teYvGfcZ2",
        "outputId": "60f69844-0a0e-4b3f-9882-d1291ee433b4"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Standard library\n",
        "import re\n",
        "\n",
        "# Third-party\n",
        "import matplotlib.pyplot as plt\n",
        "import nltk\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import tensorflow as tf\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.tokenize import word_tokenize\n",
        "from sklearn.model_selection import train_test_split\n",
        "from tensorflow import keras\n",
        "# Import the preprocessing helpers through tensorflow.keras so they come from\n",
        "# the same Keras as the `keras` name bound above; `from keras...` would load\n",
        "# the standalone keras package instead.\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "\n",
        "# Download the NLTK data packages (stopwords corpus, punkt tokenizer models).\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0iHYnJ53foct",
        "outputId": "d9c9717b-5368-474b-b3e8-dcd85f4f4ce5"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data] Unzipping corpora/stopwords.zip.\n",
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# 'train.csv/' is a directory in the dataset folder (see the listing above);\n",
        "# the CSV file sits inside it. Paths are relative to the folder set by %cd.\n",
        "train = pd.read_csv('train.csv/train.csv')\n"
      ],
      "metadata": {
        "id": "GBGHqE1Wfeb9"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test = pd.read_csv('test.csv/test.csv')"
      ],
      "metadata": {
        "id": "-wGlqD9GZrkC"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "test_labels = pd.read_csv('test_labels.csv/test_labels.csv')"
      ],
      "metadata": {
        "id": "haDqTJbRZwy3"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train.shape"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mllErgckfvOa",
        "outputId": "fc720f9b-d80d-48af-fed8-b472d554c22f"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(159571, 8)"
            ]
          },
          "metadata": {},
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "train.head()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 461
        },
        "id": "v5JPdmYTf36i",
        "outputId": "83d2d1ca-af74-4a91-8ac5-594f7ea28055"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              " id comment_text toxic \\\n",
              "0 0000997932d777bf Explanation\\nWhy the edits 
made under my usern... 0 \n", "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n", "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n", "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n", "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n", "\n", " severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 " ], "text/html": [ "\n", "
| \n", " | id | \n", "comment_text | \n", "toxic | \n", "severe_toxic | \n", "obscene | \n", "threat | \n", "insult | \n", "identity_hate | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0000997932d777bf | \n", "Explanation\\nWhy the edits made under my usern... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| 1 | \n", "000103f0d9cfb60f | \n", "D'aww! He matches this background colour I'm s... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "000113f07ec002fd | \n", "Hey man, I'm really not trying to edit war. It... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| 3 | \n", "0001b41b1c6bb37e | \n", "\"\\nMore\\nI can't make any real suggestions on ... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| 4 | \n", "0001d958c54c6e35 | \n", "You, sir, are my hero. Any chance you remember... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "