Spaces:

soumyaprabhamaiti
/

hate_speech_classifier

Runtime error

App Files Files Community

soumyaprabhamaiti commited on Sep 3, 2023

Commit

3abbcfd

1 Parent(s): 5ce506c

Add development folder

Browse files

Files changed (2) hide show

development/hate-speech-classification.ipynb +815 -0
development/requirements_dev.txt +8 -0

development/hate-speech-classification.ipynb ADDED Viewed

	@@ -0,0 +1,815 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c99a9e2c",
+   "metadata": {},
+   "source": [
+    "# Import the necessary libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb19171c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pickle\n",
+    "import re\n",
+    "import string\n",
+    "from collections.abc import Iterable\n",
+    "\n",
+    "import keras\n",
+    "import matplotlib.pyplot as plt\n",
+    "import nltk\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
+    "from keras.layers import (LSTM, Activation, Dense, Dropout, Embedding, Input,\n",
+    "                          SpatialDropout1D)\n",
+    "from keras.models import Model, Sequential\n",
+    "from keras.optimizers import RMSprop\n",
+    "from keras.preprocessing import sequence\n",
+    "from keras.preprocessing.text import Tokenizer\n",
+    "from keras.utils import pad_sequences, to_categorical\n",
+    "from nltk.corpus import stopwords\n",
+    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "nltk.download('stopwords')\n",
+    "pd.set_option('display.max_rows', None)\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "pd.set_option('display.max_colwidth', 255)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77ee39a1",
+   "metadata": {},
+   "source": [
+    "# Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2289c89e",
+   "metadata": {},
+   "source": [
+    "## Dataset 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70bddc47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_csv(\"/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e407435d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ea10f67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.countplot(x='label', data=df1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bef62c7",
+   "metadata": {},
+   "source": [
+    "From the above plot we can see that the classes are imbalanced; we will fix this later."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "252edcb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Checking the shape of the data\n",
+    "df1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e256090",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Checking whether null values are present in the dataset.\n",
+    "df1.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d0cc255",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop unnecessary columns\n",
+    "df1.drop('id', axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "963f8229",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5767e166",
+   "metadata": {},
+   "source": [
+    "## Dataset 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd8dde1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.read_csv(\n",
+    "    \"/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv\")\n",
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8a4a332",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b66a6907",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49db9d8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop the columns which are not required for us.\n",
+    "df2.drop(['Unnamed: 0', 'count', 'hate_speech',\n",
+    "         'offensive_language', 'neither'], axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48981e64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97b0500b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All the unique class labels\n",
+    "df2['class'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71971d95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plotting the countplot for our new dataset\n",
+    "sns.countplot(x='class', data=df2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1ce30639",
+   "metadata": {},
+   "source": [
+    "-  class 0 - hate speech; class 1 - offensive language; class 2 - neither"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce04999f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merge class 0 into class 1. Class 1 now represents hate speech.\n",
+    "# The result is assigned back to the column: calling an in-place method on\n",
+    "# the `df2[\"class\"]` selection is not guaranteed to write through to df2\n",
+    "# (pandas warns about it and ignores it under Copy-on-Write).\n",
+    "df2[\"class\"] = df2[\"class\"].replace({0: 1})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "499d5336",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2[\"class\"].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cb91824",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.countplot(x=\"class\", data=df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9bf7ba3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace the value 2 with 0. Class 0 is now \"No hate\".\n",
+    "# Assigned back rather than mutated through the column selection, so the\n",
+    "# change reliably lands in df2 regardless of pandas' copy semantics.\n",
+    "df2[\"class\"] = df2[\"class\"].replace({2: 0})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16bc2c3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.countplot(x='class', data=df2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5834f0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rename 'class' to label\n",
+    "df2.rename(columns={'class': 'label'}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e6a6a19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b76458f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.iloc[0]['tweet']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42a65071",
+   "metadata": {},
+   "source": [
+    "## Merge df1 and df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77c925a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stack both datasets. ignore_index=True gives the merged frame a fresh\n",
+    "# 0..n-1 index; otherwise every label from df2 collides with one from df1\n",
+    "# and label-based lookups such as df['tweet'][1] return several rows.\n",
+    "df = pd.concat([df1, df2], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b81eef43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "952ef123",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.countplot(x='label', data=df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "608c3277",
+   "metadata": {},
+   "source": [
+    "Now we can see that the problem of imbalanced data has been solved."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "293d0d21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4d8117e1",
+   "metadata": {},
+   "source": [
+    "## Data cleaning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e76a3db9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Regex-based text cleaning, stop-word removal and stemming.\n",
+    "# The stop-word set and the stemmer are built once here: creating them on\n",
+    "# every call is wasted work when clean_text is applied to each tweet.\n",
+    "ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))\n",
+    "SNOWBALL_STEMMER = nltk.SnowballStemmer(\"english\")\n",
+    "\n",
+    "\n",
+    "def clean_text(words: str) -> str:\n",
+    "    \"\"\"Normalise raw text into space-separated, stemmed tokens.\n",
+    "\n",
+    "    Steps, in order: lowercase; remove bracketed spans, URLs, angle-bracket\n",
+    "    tags, @mentions, punctuation, newlines and any token containing a digit;\n",
+    "    drop English stop words; apply Snowball stemming.\n",
+    "\n",
+    "    Args:\n",
+    "        words: Raw text. Non-string values are converted with str().\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned text as a single string.\n",
+    "    \"\"\"\n",
+    "    words = str(words).lower()\n",
+    "    # Raw string literals keep the regex escapes out of Python's own escape\n",
+    "    # processing (no invalid-escape warnings); the patterns are unchanged.\n",
+    "    words = re.sub(r'\\[.*?\\]', '', words)\n",
+    "    words = re.sub(r'https?://\\S+|www\\.\\S+', '', words)\n",
+    "    words = re.sub('<.*?>+', '', words)\n",
+    "    words = re.sub(r'@\\w+', '', words)\n",
+    "    words = re.sub('[%s]' % re.escape(string.punctuation), '', words)\n",
+    "    words = re.sub('\\n', '', words)\n",
+    "    words = re.sub(r'\\w*\\d\\w*', '', words)\n",
+    "\n",
+    "    # Splitting on a single space (not arbitrary whitespace) is deliberate:\n",
+    "    # it reproduces the token boundaries the tokenizer is fitted on.\n",
+    "    return ' '.join(\n",
+    "        SNOWBALL_STEMMER.stem(word)\n",
+    "        for word in words.split(' ')\n",
+    "        if word not in ENGLISH_STOPWORDS\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fd98ec5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Apply the data_cleaning on the data.\n",
+    "df_cleaned = df.copy()\n",
+    "df_cleaned['tweet'] = df['tweet'].apply(clean_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5c6a309",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_cleaned['tweet'][1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3df4b3e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_cleaned.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39e9dff5",
+   "metadata": {},
+   "source": [
+    "## Train test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "060e1f76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = df_cleaned['tweet']\n",
+    "y = df_cleaned['label']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b39fbd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the data into train and test\n",
+    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)\n",
+    "print(len(x_train), len(y_train))\n",
+    "print(len(x_test), len(y_test))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29be47f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(x_test), type(y_test), type(x_train), type(y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "402ecb50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(x_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0187c473",
+   "metadata": {},
+   "source": [
+    "## Tokenization and padding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc49a7f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:\n",
+    "    \"\"\"Turn texts into a fixed-width matrix of token ids.\n",
+    "\n",
+    "    Args:\n",
+    "        text_list: Texts to encode.\n",
+    "        tokenizer: An already fitted Tokenizer; it is not re-fitted here.\n",
+    "        max_len: Width of every row; passed to pad_sequences as maxlen.\n",
+    "\n",
+    "    Returns:\n",
+    "        The array produced by pad_sequences: one row of integer token ids\n",
+    "        per input text (not strings).\n",
+    "    \"\"\"\n",
+    "    sequences = tokenizer.texts_to_sequences(text_list)\n",
+    "    return pad_sequences(sequences, maxlen=max_len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4329001",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "max_words = 50000\n",
+    "max_len = 300\n",
+    "\n",
+    "tokenizer = Tokenizer(num_words=max_words)\n",
+    "tokenizer.fit_on_texts(x_train)\n",
+    "\n",
+    "x_train_tokenized = tokenize_and_pad(x_train, tokenizer, max_len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21261eee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('tokenizer.pickle', 'wb') as handle:\n",
+    "    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5833c859",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_train_tokenized"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "811f8996",
+   "metadata": {},
+   "source": [
+    "# Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b42ceb66",
+   "metadata": {},
+   "source": [
+    "## Model architecture"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15e9d814",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "# Model architecture: embedding -> spatial dropout -> LSTM -> one sigmoid unit.\n",
+    "model = Sequential([\n",
+    "    Embedding(max_words, 100, input_length=max_len),\n",
+    "    SpatialDropout1D(0.2),\n",
+    "    LSTM(100, dropout=0.2, recurrent_dropout=0.2),\n",
+    "    Dense(1, activation='sigmoid'),\n",
+    "])\n",
+    "\n",
+    "model.summary()\n",
+    "\n",
+    "# Binary target, so binary cross-entropy with accuracy as the tracked metric.\n",
+    "model.compile(\n",
+    "    loss='binary_crossentropy',\n",
+    "    optimizer=RMSprop(),\n",
+    "    metrics=['accuracy'],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae55985d",
+   "metadata": {},
+   "source": [
+    "## Callbacks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9065382d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "early_stopping_callback = EarlyStopping(\n",
+    "    monitor='val_loss',  # Metric to monitor (e.g., validation loss)\n",
+    "    patience=3,           # Number of epochs with no improvement to wait\n",
+    "    restore_best_weights=True  # Restore model weights to the best achieved during training\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90fb2dbf",
+   "metadata": {},
+   "source": [
+    "## Training\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb3a5153",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# starting model training\n",
+    "history = model.fit(\n",
+    "    x_train_tokenized, y_train,\n",
+    "    batch_size=128,\n",
+    "    epochs=20,\n",
+    "    validation_split=0.2,\n",
+    "    callbacks=[early_stopping_callback]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b509694a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(\"model.h5\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01484e53",
+   "metadata": {},
+   "source": [
+    "## Evaluation and testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86a6cd51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Encode the held-out texts with the same helper used for x_train, so the\n",
+    "# test matrix is tokenized and padded exactly like the training matrix.\n",
+    "test_sequences_matrix = tokenize_and_pad(x_test, tokenizer, max_len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7674863a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Model evaluation\n",
+    "accr = model.evaluate(test_sequences_matrix, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03f93f02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lstm_prediction = model.predict(test_sequences_matrix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b04a6f5",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "# Threshold each score: below 0.5 becomes class 0, anything else class 1.\n",
+    "res = [0 if prediction[0] < 0.5 else 1 for prediction in lstm_prediction]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20ec485c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(confusion_matrix(y_test, res))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0062900e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "load_model = keras.models.load_model(\"model.h5\")\n",
+    "with open('tokenizer.pickle', 'rb') as handle:\n",
+    "    load_tokenizer = pickle.load(handle)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5612cac0",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "# Test the reloaded model and tokenizer on custom text.\n",
+    "# The text goes through the clean_text() defined in the data-cleaning\n",
+    "# section, so inference uses exactly the preprocessing the tokenizer was\n",
+    "# fitted on. (A second copy of the function used to live here; it referenced\n",
+    "# `stopword` and `stemmer`, which are not defined at notebook level, and it\n",
+    "# skipped the @mention removal step.)\n",
+    "sample_text = 'humans are idiots'\n",
+    "\n",
+    "test = [clean_text(sample_text)]\n",
+    "print(test)\n",
+    "seq = load_tokenizer.texts_to_sequences(test)\n",
+    "# Pad to the same width the model was built with.\n",
+    "padded = pad_sequences(seq, maxlen=max_len)\n",
+    "print(seq)\n",
+    "pred = load_model.predict(padded)\n",
+    "print(\"pred\", pred)\n",
+    "# predict() returns one row per input text; read the single score explicitly\n",
+    "# instead of comparing the whole array to a scalar.\n",
+    "if pred[0][0] < 0.5:\n",
+    "    print(\"no hate\")\n",
+    "else:\n",
+    "    print(\"hate and abusive\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d90fb1eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e564ae3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional keep-alive (presumably meant to stop a hosted session from\n",
+    "# shutting down - confirm). It is off by default so that \"Restart & Run All\"\n",
+    "# finishes: an unconditional `while True: pass` never returns and busy-spins.\n",
+    "KEEP_SESSION_ALIVE = False\n",
+    "\n",
+    "if KEEP_SESSION_ALIVE:\n",
+    "    import time\n",
+    "\n",
+    "    while True:\n",
+    "        time.sleep(60)  # idle instead of spinning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41301aee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://www.kaggle.com/soumyaprabhamaiti/hate-speech-classification/edit"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

development/requirements_dev.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+tensorflow
+numpy
+pandas
+seaborn
+matplotlib
+gradio
+nltk
+jupytext