{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro0000",
   "metadata": {},
   "source": [
    "# Hate Speech Detection - TF-IDF + Random Forest\n",
    "\n",
    "Trains a 3-class text classifier on the\n",
    "[tdavidson/hate_speech_offensive](https://huggingface.co/datasets/tdavidson/hate_speech_offensive)\n",
    "dataset (labels per the dataset card: 0 = hate speech, 1 = offensive language, 2 = neither).\n",
    "\n",
    "**TL;DR:** ~0.90 test accuracy overall, but recall on the minority hate-speech\n",
    "class (0) is only 0.15 (see the report at the bottom). **TODO:** address the\n",
    "class imbalance (resampling, a different model, or external hate-speech data)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "loadhdr00",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b675bcf0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],\n",
      "        num_rows: 24783\n",
      "    })\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"tdavidson/hate_speech_offensive\")\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7ede2ff8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2433371"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialise the train split locally so later cells don't need the network.\n",
    "# (Return value is the number of bytes written.)\n",
    "dataset[\"train\"].to_csv(\"train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "673a46d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import joblib\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cleanhdr0",
   "metadata": {},
   "source": [
    "## Inspect and clean\n",
    "\n",
    "The class distribution is heavily imbalanced (class 0, hate speech, is only\n",
    "~6% of rows), which motivates `stratify=y` and `class_weight='balanced'` below.\n",
    "Two cleaning strategies are defined; only `minimal_clean` is used for the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7022ee7c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class distribution:\n",
      "class\n",
      "1    19190\n",
      "2     4163\n",
      "0     1430\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('train.csv')\n",
    "\n",
    "print(\"Class distribution:\")\n",
    "print(df['class'].value_counts())\n",
    "\n",
    "def clean_tweet(tweet):\n",
    "    \"\"\"Aggressive normalisation: URL/USER/HASHTAG/ELLIPSIS/EXCLAMATION tokens,\n",
    "    punctuation stripping, and repeated-character squashing.\n",
    "\n",
    "    Currently UNUSED - kept as an alternative to compare against minimal_clean.\n",
    "    \"\"\"\n",
    "    if pd.isna(tweet):\n",
    "        return \"\"\n",
    "    tweet = re.sub(r'http\\S+', ' URL ', tweet)           # URLs -> URL token\n",
    "    tweet = re.sub(r'@\\w+', ' USER ', tweet)             # @mentions -> USER token\n",
    "    tweet = re.sub(r'#(\\w+)', r'HASHTAG_\\1', tweet)      # keep hashtag text as a token\n",
    "    tweet = re.sub(r'\\.{2,}', ' ELLIPSIS ', tweet)\n",
    "    tweet = re.sub(r'!{2,}', ' EXCLAMATION ', tweet)\n",
    "    tweet = re.sub(r'[^\\w\\s!?*#$%&]', ' ', tweet)        # drop remaining punctuation\n",
    "    tweet = re.sub(r'(.)\\1{2,}', r'\\1\\1', tweet)         # squash runs: 'soooo' -> 'soo'\n",
    "    tweet = tweet.lower().strip()\n",
    "    tweet = ' '.join(tweet.split())                      # collapse whitespace\n",
    "    return tweet\n",
    "\n",
    "def minimal_clean(tweet):\n",
    "    \"\"\"Light normalisation used for the model: replace URLs and @mentions\n",
    "    with tokens, lower-case, and collapse whitespace.\"\"\"\n",
    "    if pd.isna(tweet):\n",
    "        return \"\"\n",
    "    tweet = re.sub(r'http\\S+', ' URL ', tweet)\n",
    "    tweet = re.sub(r'@\\w+', ' USER ', tweet)\n",
    "    tweet = tweet.lower().strip()\n",
    "    return ' '.join(tweet.split())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "modelhdr0",
   "metadata": {},
   "source": [
    "## Features and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d2086d16",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['cleaned_tweet'] = df['tweet'].apply(minimal_clean)\n",
    "\n",
    "X = df['cleaned_tweet']\n",
    "y = df['class']\n",
    "\n",
    "# Stratified split: class 0 is only 1430/24783 rows, so an unstratified\n",
    "# split could leave the test set with very few hate-speech examples.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(\n",
    "        max_features=10000,\n",
    "        stop_words='english',\n",
    "        ngram_range=(1, 3),\n",
    "        min_df=2,\n",
    "        max_df=0.9,\n",
    "        analyzer='word',\n",
    "        token_pattern=r'(?u)\\b\\w+\\b',   # also keeps single-character tokens\n",
    "    )),\n",
    "    ('clf', RandomForestClassifier(\n",
    "        n_estimators=200,\n",
    "        random_state=42,\n",
    "        n_jobs=-1,\n",
    "        class_weight='balanced'         # compensate for the class imbalance\n",
    "    ))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "trainhdr0",
   "metadata": {},
   "source": [
    "## Train, evaluate, save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d3c15ee9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training model\n",
      "\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.53      0.15      0.23       286\n",
      "           1       0.92      0.96      0.94      3838\n",
      "           2       0.83      0.87      0.85       833\n",
      "\n",
      "    accuracy                           0.90      4957\n",
      "   macro avg       0.76      0.66      0.67      4957\n",
      "weighted avg       0.88      0.90      0.88      4957\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['hate_speech_model.pkl']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"training model\")\n",
    "pipeline.fit(X_train, y_train)\n",
    "y_pred = pipeline.predict(X_test)\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_test, y_pred))\n",
    "\n",
    "# Persist the whole pipeline (vectorizer + model) for inference elsewhere.\n",
    "joblib.dump(pipeline, 'hate_speech_model.pkl')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}