{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.745657Z", "iopub.status.busy": "2026-01-20T09:42:14.744873Z", "iopub.status.idle": "2026-01-20T09:42:14.750198Z", "shell.execute_reply": "2026-01-20T09:42:14.749406Z", "shell.execute_reply.started": "2026-01-20T09:42:14.745620Z" }, "trusted": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.metrics import accuracy_score, classification_report" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.752013Z", "iopub.status.busy": "2026-01-20T09:42:14.751712Z", "iopub.status.idle": "2026-01-20T09:42:14.831116Z", "shell.execute_reply": "2026-01-20T09:42:14.830201Z", "shell.execute_reply.started": "2026-01-20T09:42:14.751978Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1562, 20)\n" ] } ], "source": [ "# DATA_PATH = \"/kaggle/input/bot-detection-data/bot_detection_data.csv\"\n", "DATA_PATH = \"/kaggle/input/bot-detection-data/training_data.csv\"\n", "\n", "df = pd.read_csv(DATA_PATH)\n", "print(df.shape)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.833146Z", "iopub.status.busy": "2026-01-20T09:42:14.832832Z", "iopub.status.idle": "2026-01-20T09:42:14.849605Z", "shell.execute_reply": "2026-01-20T09:42:14.848831Z", "shell.execute_reply.started": "2026-01-20T09:42:14.833119Z" }, "trusted": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idid_strscreen_namelocationdescriptionurlfollowers_countfriends_countlistedcountcreated_atfavourites_countverifiedstatuses_countlangstatusdefault_profiledefault_profile_imagehas_extended_profilenamebot
01.953701e+08195370058kanyejordanNaNThis is what I do. I drop truth bombs.NaN292531399/26/2010 14:450False708enStatus(in_reply_to_status_id=None, favorited=F...TrueFalseFalseKanye Jordan1
17.950000e+177.95E+17astronaut_botNaNKeeping an eye on astronauts coming and going....NaN905Fri Nov 04 12:11:27 +0000 20160False6en{'created_at': 'Tue Nov 22 16:52:31 +0000 2016...TrueFalseFalseAstronaut Notifier1
22.976541e+092976541239TheRiddlerBotCoimbra, PortugalSolve the riddle by replying only the name of ...https://t.co/1v8BON9QpT13246241/13/2015 15:10740False7346enStatus(contributors=None, truncated=False, tex...TrueFalseFalseTheRiddlerBot1
32.243832e+08224383150mlegoudes262NaNNaNNaN5413510Wed Dec 08 21:29:31 +0000 20102False6en{'truncated': False, 'entities': {'user_mentio...TrueFalseFalseLaurie Poulsen1
41.134712e+0711347122GavinNewsomCaliforniaHusband & father. 49th Lt. Gov. of California ...https://t.co/XrGnfzTDJD1300380242487089Wed Dec 19 19:53:42 +0000 20074184True8536en{u'contributors': None, u'truncated': True, u'...FalseFalseFalseGavin Newsom0
\n", "
" ], "text/plain": [ " id id_str screen_name location \\\n", "0 1.953701e+08 195370058 kanyejordan NaN \n", "1 7.950000e+17 7.95E+17 astronaut_bot NaN \n", "2 2.976541e+09 2976541239 TheRiddlerBot Coimbra, Portugal \n", "3 2.243832e+08 224383150 mlegoudes262 NaN \n", "4 1.134712e+07 11347122 GavinNewsom California \n", "\n", " description url \\\n", "0 This is what I do. I drop truth bombs. NaN \n", "1 Keeping an eye on astronauts coming and going.... NaN \n", "2 Solve the riddle by replying only the name of ... https://t.co/1v8BON9QpT \n", "3 NaN NaN \n", "4 Husband & father. 49th Lt. Gov. of California ... https://t.co/XrGnfzTDJD \n", "\n", " followers_count friends_count listedcount \\\n", "0 2925 3 139 \n", "1 9 0 5 \n", "2 132 46 24 \n", "3 54 1351 0 \n", "4 1300380 24248 7089 \n", "\n", " created_at favourites_count verified statuses_count \\\n", "0 9/26/2010 14:45 0 False 708 \n", "1 Fri Nov 04 12:11:27 +0000 2016 0 False 6 \n", "2 1/13/2015 15:10 740 False 7346 \n", "3 Wed Dec 08 21:29:31 +0000 2010 2 False 6 \n", "4 Wed Dec 19 19:53:42 +0000 2007 4184 True 8536 \n", "\n", " lang status default_profile \\\n", "0 en Status(in_reply_to_status_id=None, favorited=F... True \n", "1 en {'created_at': 'Tue Nov 22 16:52:31 +0000 2016... True \n", "2 en Status(contributors=None, truncated=False, tex... True \n", "3 en {'truncated': False, 'entities': {'user_mentio... True \n", "4 en {u'contributors': None, u'truncated': True, u'... False \n", "\n", " default_profile_image has_extended_profile name bot \n", "0 False False Kanye Jordan 1 \n", "1 False False Astronaut Notifier 1 \n", "2 False False TheRiddlerBot 1 \n", "3 False False Laurie Poulsen 1 \n", "4 False False Gavin Newsom 0 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.851012Z", "iopub.status.busy": "2026-01-20T09:42:14.850700Z", "iopub.status.idle": "2026-01-20T09:42:14.867213Z", "shell.execute_reply": "2026-01-20T09:42:14.866311Z", "shell.execute_reply.started": "2026-01-20T09:42:14.850985Z" }, "trusted": true }, "outputs": [], "source": [ "FEATURES = [\n", " \"followers_count\",\n", " \"friends_count\",\n", " \"listedcount\",\n", " \"favourites_count\",\n", " \"statuses_count\",\n", " \"verified\",\n", " \"default_profile\",\n", " \"default_profile_image\",\n", " \"has_extended_profile\"\n", "]\n", "\n", "X = df[FEATURES].fillna(0)\n", "y = df[\"bot\"]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.869442Z", "iopub.status.busy": "2026-01-20T09:42:14.869099Z", "iopub.status.idle": "2026-01-20T09:42:14.884158Z", "shell.execute_reply": "2026-01-20T09:42:14.883300Z", "shell.execute_reply.started": "2026-01-20T09:42:14.869405Z" }, "trusted": true }, "outputs": [], "source": [ "bool_cols = [\n", " \"verified\",\n", " \"default_profile\",\n", " \"default_profile_image\",\n", " \"has_extended_profile\"\n", "]\n", "\n", "for col in bool_cols:\n", " X[col] = X[col].astype(int)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.885944Z", "iopub.status.busy": "2026-01-20T09:42:14.885337Z", "iopub.status.idle": "2026-01-20T09:42:14.899788Z", "shell.execute_reply": "2026-01-20T09:42:14.898857Z", "shell.execute_reply.started": "2026-01-20T09:42:14.885913Z" }, "trusted": true }, "outputs": [], "source": [ "X[\"follow_ratio\"] = X[\"followers_count\"] / (X[\"friends_count\"] + 1)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.901446Z", "iopub.status.busy": "2026-01-20T09:42:14.901029Z", "iopub.status.idle": "2026-01-20T09:42:14.920930Z", "shell.execute_reply": "2026-01-20T09:42:14.920119Z", "shell.execute_reply.started": "2026-01-20T09:42:14.901408Z" }, "trusted": true }, "outputs": [], "source": [ "df[\"created_at\"] = pd.to_datetime(df[\"created_at\"], errors=\"coerce\")\n", "\n", "X[\"account_age_days\"] = (\n", " pd.Timestamp.now() - df[\"created_at\"]\n", ").dt.days.fillna(0)\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.922400Z", "iopub.status.busy": "2026-01-20T09:42:14.922068Z", "iopub.status.idle": "2026-01-20T09:42:14.940152Z", "shell.execute_reply": "2026-01-20T09:42:14.939293Z", "shell.execute_reply.started": "2026-01-20T09:42:14.922365Z" }, "trusted": true }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X,\n", " y,\n", " test_size=0.2,\n", " random_state=42\n", ")\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:14.941565Z", "iopub.status.busy": "2026-01-20T09:42:14.941261Z", "iopub.status.idle": "2026-01-20T09:42:15.734600Z", "shell.execute_reply": "2026-01-20T09:42:15.733765Z", "shell.execute_reply.started": "2026-01-20T09:42:14.941540Z" }, "trusted": true }, "outputs": [ { "data": { "text/html": [ "
RandomForestClassifier(class_weight='balanced', max_depth=20,\n",
       "                       min_samples_leaf=2, n_estimators=300, n_jobs=-1,\n",
       "                       random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "RandomForestClassifier(class_weight='balanced', max_depth=20,\n", " min_samples_leaf=2, n_estimators=300, n_jobs=-1,\n", " random_state=42)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf = RandomForestClassifier(\n", " n_estimators=300,\n", " max_depth=20,\n", " min_samples_leaf=2,\n", " class_weight=\"balanced\",\n", " random_state=42,\n", " n_jobs=-1\n", ")\n", "\n", "rf.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:15.736130Z", "iopub.status.busy": "2026-01-20T09:42:15.735775Z", "iopub.status.idle": "2026-01-20T09:42:15.851114Z", "shell.execute_reply": "2026-01-20T09:42:15.850291Z", "shell.execute_reply.started": "2026-01-20T09:42:15.736093Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8785942492012779\n", " precision recall f1-score support\n", "\n", " 0 0.90 0.87 0.89 169\n", " 1 0.85 0.89 0.87 144\n", "\n", " accuracy 0.88 313\n", " macro avg 0.88 0.88 0.88 313\n", "weighted avg 0.88 0.88 0.88 313\n", "\n" ] } ], "source": [ "preds = rf.predict(X_test)\n", "\n", "print(\"Accuracy:\", accuracy_score(y_test, preds))\n", "print(classification_report(y_test, preds))\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:15.853420Z", "iopub.status.busy": "2026-01-20T09:42:15.853099Z", "iopub.status.idle": "2026-01-20T09:42:15.919231Z", "shell.execute_reply": "2026-01-20T09:42:15.918360Z", "shell.execute_reply.started": "2026-01-20T09:42:15.853391Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " feature importance\n", "1 friends_count 0.204309\n", "9 follow_ratio 0.144836\n", "3 favourites_count 0.135528\n", "0 followers_count 0.109556\n", "5 verified 0.099516\n", "10 account_age_days 0.090862\n", "2 listedcount 0.088300\n", "4 statuses_count 0.076216\n", "6 default_profile 0.039780\n", "8 has_extended_profile 0.008163\n", "7 default_profile_image 0.002935\n" ] } ], "source": [ "imp = pd.DataFrame({\n", " \"feature\": X.columns,\n", " \"importance\": rf.feature_importances_\n", "}).sort_values(by=\"importance\", ascending=False)\n", "\n", "print(imp)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:15.920668Z", "iopub.status.busy": "2026-01-20T09:42:15.920341Z", "iopub.status.idle": "2026-01-20T09:42:16.022530Z", "shell.execute_reply": "2026-01-20T09:42:16.021678Z", "shell.execute_reply.started": "2026-01-20T09:42:15.920632Z" }, "trusted": true }, "outputs": [ { "data": { "text/plain": [ "['bot_model.joblib']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import joblib\n", "\n", "joblib.dump(rf, \"bot_model.joblib\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "execution": { "iopub.execute_input": "2026-01-20T09:42:16.024523Z", "iopub.status.busy": "2026-01-20T09:42:16.023646Z", "iopub.status.idle": "2026-01-20T09:42:16.029010Z", "shell.execute_reply": "2026-01-20T09:42:16.028344Z", "shell.execute_reply.started": "2026-01-20T09:42:16.024490Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RF trained feature count: 11\n", "RF trained feature names:\n", "['followers_count', 'friends_count', 'listedcount', 'favourites_count', 'statuses_count', 'verified', 'default_profile', 'default_profile_image', 'has_extended_profile', 'follow_ratio', 'account_age_days']\n" ] } ], "source": [ "# ✅ After training RF\n", "print(\"RF trained feature count:\", len(rf.feature_names_in_))\n", "print(\"RF trained feature names:\")\n", "print(list(rf.feature_names_in_))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [] } ], "metadata": { "kaggle": { "accelerator": "none", "dataSources": [ { "datasetId": 9259817, "sourceId": 14497523, "sourceType": "datasetVersion" } ], "dockerImageVersionId": 31234, "isGpuEnabled": false, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 4 }