{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro0000",
   "metadata": {},
   "source": [
    "# Hate Speech Detection - TF-IDF + Random Forest\n",
    "\n",
    "Trains a 3-class text classifier on the\n",
    "[tdavidson/hate_speech_offensive](https://huggingface.co/datasets/tdavidson/hate_speech_offensive)\n",
    "dataset (labels per the dataset card: 0 = hate speech, 1 = offensive language, 2 = neither).\n",
    "\n",
    "**TL;DR:** ~0.90 test accuracy overall, but recall on the minority hate-speech\n",
    "class (0) is only 0.15 (see the report at the bottom). **TODO:** address the\n",
    "class imbalance (resampling, a different model, or external hate-speech data)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "loadhdr00",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b675bcf0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],\n",
      "        num_rows: 24783\n",
      "    })\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"tdavidson/hate_speech_offensive\")\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7ede2ff8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2433371"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialise the train split locally so later cells don't need the network.\n",
    "# (Return value is the number of bytes written.)\n",
    "dataset[\"train\"].to_csv(\"train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "673a46d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import joblib\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cleanhdr0",
   "metadata": {},
   "source": [
    "## Inspect and clean\n",
    "\n",
    "The class distribution is heavily imbalanced (class 0, hate speech, is only\n",
    "~6% of rows), which motivates `stratify=y` and `class_weight='balanced'` below.\n",
    "Two cleaning strategies are defined; only `minimal_clean` is used for the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7022ee7c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class distribution:\n",
      "class\n",
      "1    19190\n",
      "2     4163\n",
      "0     1430\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('train.csv')\n",
    "\n",
    "print(\"Class distribution:\")\n",
    "print(df['class'].value_counts())\n",
    "\n",
    "def clean_tweet(tweet):\n",
    "    \"\"\"Aggressive normalisation: URL/USER/HASHTAG/ELLIPSIS/EXCLAMATION tokens,\n",
    "    punctuation stripping, and repeated-character squashing.\n",
    "\n",
    "    Currently UNUSED - kept as an alternative to compare against minimal_clean.\n",
    "    \"\"\"\n",
    "    if pd.isna(tweet):\n",
    "        return \"\"\n",
    "    tweet = re.sub(r'http\\S+', ' URL ', tweet)           # URLs -> URL token\n",
    "    tweet = re.sub(r'@\\w+', ' USER ', tweet)             # @mentions -> USER token\n",
    "    tweet = re.sub(r'#(\\w+)', r'HASHTAG_\\1', tweet)      # keep hashtag text as a token\n",
    "    tweet = re.sub(r'\\.{2,}', ' ELLIPSIS ', tweet)\n",
    "    tweet = re.sub(r'!{2,}', ' EXCLAMATION ', tweet)\n",
    "    tweet = re.sub(r'[^\\w\\s!?*#$%&]', ' ', tweet)        # drop remaining punctuation\n",
    "    tweet = re.sub(r'(.)\\1{2,}', r'\\1\\1', tweet)         # squash runs: 'soooo' -> 'soo'\n",
    "    tweet = tweet.lower().strip()\n",
    "    tweet = ' '.join(tweet.split())                      # collapse whitespace\n",
    "    return tweet\n",
    "\n",
    "def minimal_clean(tweet):\n",
    "    \"\"\"Light normalisation used for the model: replace URLs and @mentions\n",
    "    with tokens, lower-case, and collapse whitespace.\"\"\"\n",
    "    if pd.isna(tweet):\n",
    "        return \"\"\n",
    "    tweet = re.sub(r'http\\S+', ' URL ', tweet)\n",
    "    tweet = re.sub(r'@\\w+', ' USER ', tweet)\n",
    "    tweet = tweet.lower().strip()\n",
    "    return ' '.join(tweet.split())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "modelhdr0",
   "metadata": {},
   "source": [
    "## Features and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d2086d16",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['cleaned_tweet'] = df['tweet'].apply(minimal_clean)\n",
    "\n",
    "X = df['cleaned_tweet']\n",
    "y = df['class']\n",
    "\n",
    "# Stratified split: class 0 is only 1430/24783 rows, so an unstratified\n",
    "# split could leave the test set with very few hate-speech examples.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(\n",
    "        max_features=10000,\n",
    "        stop_words='english',\n",
    "        ngram_range=(1, 3),\n",
    "        min_df=2,\n",
    "        max_df=0.9,\n",
    "        analyzer='word',\n",
    "        token_pattern=r'(?u)\\b\\w+\\b',   # also keeps single-character tokens\n",
    "    )),\n",
    "    ('clf', RandomForestClassifier(\n",
    "        n_estimators=200,\n",
    "        random_state=42,\n",
    "        n_jobs=-1,\n",
    "        class_weight='balanced'         # compensate for the class imbalance\n",
    "    ))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "trainhdr0",
   "metadata": {},
   "source": [
    "## Train, evaluate, save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d3c15ee9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training model\n",
      "\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.53      0.15      0.23       286\n",
      "           1       0.92      0.96      0.94      3838\n",
      "           2       0.83      0.87      0.85       833\n",
      "\n",
      "    accuracy                           0.90      4957\n",
      "   macro avg       0.76      0.66      0.67      4957\n",
      "weighted avg       0.88      0.90      0.88      4957\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['hate_speech_model.pkl']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"training model\")\n",
    "pipeline.fit(X_train, y_train)\n",
    "y_pred = pipeline.predict(X_test)\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_test, y_pred))\n",
    "\n",
    "# Persist the whole pipeline (vectorizer + model) for inference elsewhere.\n",
    "joblib.dump(pipeline, 'hate_speech_model.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}