Spaces:
Sleeping
Sleeping
Commit
·
1d571f0
1
Parent(s):
413103d
Update bot detection model and features
Browse files- BotDetectionEDA.ipynb +30 -38
BotDetectionEDA.ipynb
CHANGED
|
@@ -13,19 +13,18 @@
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
-
"execution_count":
|
| 17 |
"metadata": {
|
| 18 |
"id": "pLAYgHzBCh3U"
|
| 19 |
},
|
| 20 |
"outputs": [],
|
| 21 |
"source": [
|
| 22 |
-
"
|
| 23 |
"import pandas as pd\n",
|
| 24 |
"import numpy as np\n",
|
| 25 |
"import matplotlib.pyplot as plt\n",
|
| 26 |
"import seaborn as sns\n",
|
| 27 |
"\n",
|
| 28 |
-
"# Machine Learning Libraries\n",
|
| 29 |
"from sklearn.model_selection import train_test_split\n",
|
| 30 |
"from sklearn.preprocessing import StandardScaler\n",
|
| 31 |
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
@@ -35,7 +34,7 @@
|
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"cell_type": "code",
|
| 38 |
-
"execution_count":
|
| 39 |
"metadata": {
|
| 40 |
"colab": {
|
| 41 |
"base_uri": "https://localhost:8080/",
|
|
@@ -258,11 +257,11 @@
|
|
| 258 |
}
|
| 259 |
],
|
| 260 |
"source": [
|
| 261 |
-
"
|
| 262 |
-
"file_path = \"../Dataset/training_data.csv\"
|
| 263 |
"df = pd.read_csv(file_path)\n",
|
| 264 |
"\n",
|
| 265 |
-
"
|
| 266 |
"df.head()\n"
|
| 267 |
]
|
| 268 |
},
|
|
@@ -317,7 +316,7 @@
|
|
| 317 |
},
|
| 318 |
{
|
| 319 |
"cell_type": "code",
|
| 320 |
-
"execution_count":
|
| 321 |
"metadata": {
|
| 322 |
"colab": {
|
| 323 |
"base_uri": "https://localhost:8080/"
|
|
@@ -338,11 +337,11 @@
|
|
| 338 |
}
|
| 339 |
],
|
| 340 |
"source": [
|
| 341 |
-
"
|
| 342 |
"bool_cols = [\"verified\", \"default_profile\", \"default_profile_image\"]\n",
|
| 343 |
"df[bool_cols] = df[bool_cols].astype(int)\n",
|
| 344 |
"\n",
|
| 345 |
-
"
|
| 346 |
"print(df.dtypes[bool_cols])\n"
|
| 347 |
]
|
| 348 |
},
|
|
@@ -352,14 +351,14 @@
|
|
| 352 |
"id": "UWtBhVBX8Hy2"
|
| 353 |
},
|
| 354 |
"source": [
|
| 355 |
-
"#Data Visualization\n",
|
| 356 |
"Visualizing the distribution of numerical features to understand data patterns.\n",
|
| 357 |
"This helps in identifying skewness, outliers, and differences between bot and non-bot accounts.\n"
|
| 358 |
]
|
| 359 |
},
|
| 360 |
{
|
| 361 |
"cell_type": "code",
|
| 362 |
-
"execution_count":
|
| 363 |
"metadata": {
|
| 364 |
"colab": {
|
| 365 |
"base_uri": "https://localhost:8080/",
|
|
@@ -381,26 +380,22 @@
|
|
| 381 |
}
|
| 382 |
],
|
| 383 |
"source": [
|
| 384 |
-
"# ---------------- Feature Distributions (Histograms & KDE) ---------------- #\n",
|
| 385 |
"\n",
|
| 386 |
-
"# Define numeric columns for visualization\n",
|
| 387 |
"num_cols = [\n",
|
| 388 |
" \"followers_count\", \"friends_count\", \"listedcount\", \"favourites_count\",\n",
|
| 389 |
" \"statuses_count\", \"verified\", \"default_profile\", \"default_profile_image\", \"bot\"\n",
|
| 390 |
"]\n",
|
| 391 |
"\n",
|
| 392 |
-
"# Reduce figure size while maintaining clarity\n",
|
| 393 |
"plt.figure(figsize=(12, 8))\n",
|
| 394 |
"\n",
|
| 395 |
-
"# Loop through numeric columns and plot\n",
|
| 396 |
"for i, col in enumerate(num_cols):\n",
|
| 397 |
" plt.subplot(3, 3, i + 1)\n",
|
| 398 |
" sns.histplot(df[col], bins=20, kde=True)\n",
|
| 399 |
-
" plt.title(f\"{col} Distribution\", fontsize=10)
|
| 400 |
-
" plt.xlabel(\"\")
|
| 401 |
" plt.ylabel(\"\")\n",
|
| 402 |
"\n",
|
| 403 |
-
"plt.tight_layout(pad=1)
|
| 404 |
"plt.show()\n",
|
| 405 |
"\n"
|
| 406 |
]
|
|
@@ -414,7 +409,7 @@
|
|
| 414 |
},
|
| 415 |
{
|
| 416 |
"cell_type": "code",
|
| 417 |
-
"execution_count":
|
| 418 |
"metadata": {
|
| 419 |
"colab": {
|
| 420 |
"base_uri": "https://localhost:8080/",
|
|
@@ -436,26 +431,24 @@
|
|
| 436 |
}
|
| 437 |
],
|
| 438 |
"source": [
|
| 439 |
-
"#
|
| 440 |
"\n",
|
| 441 |
-
"# Reduce figure size while maintaining readability\n",
|
| 442 |
"plt.figure(figsize=(12, 9))\n",
|
| 443 |
"\n",
|
| 444 |
-
"# Loop through numeric columns and plot boxplots\n",
|
| 445 |
"for i, col in enumerate(num_cols):\n",
|
| 446 |
" plt.subplot(3, 3, i + 1)\n",
|
| 447 |
" sns.boxplot(x=df[\"bot\"], y=df[col])\n",
|
| 448 |
-
" plt.title(f\"{col} vs. Bot\", fontsize=10)
|
| 449 |
-
" plt.xlabel(\"\")
|
| 450 |
" plt.ylabel(\"\")\n",
|
| 451 |
"\n",
|
| 452 |
-
"plt.tight_layout(pad=1)
|
| 453 |
"plt.show()\n"
|
| 454 |
]
|
| 455 |
},
|
| 456 |
{
|
| 457 |
"cell_type": "code",
|
| 458 |
-
"execution_count":
|
| 459 |
"metadata": {
|
| 460 |
"colab": {
|
| 461 |
"base_uri": "https://localhost:8080/",
|
|
@@ -477,19 +470,18 @@
|
|
| 477 |
}
|
| 478 |
],
|
| 479 |
"source": [
|
| 480 |
-
"#
|
| 481 |
"\n",
|
| 482 |
"plt.figure(figsize=(12, 9))\n",
|
| 483 |
"\n",
|
| 484 |
-
"# Loop through numeric columns and plot violin plots\n",
|
| 485 |
"for i, col in enumerate(num_cols):\n",
|
| 486 |
" plt.subplot(3, 3, i + 1)\n",
|
| 487 |
" sns.violinplot(x=df[\"bot\"], y=df[col])\n",
|
| 488 |
" plt.title(f\"{col} vs. Bot\", fontsize=10)\n",
|
| 489 |
-
" plt.xlabel(\"\")
|
| 490 |
" plt.ylabel(\"\")\n",
|
| 491 |
"\n",
|
| 492 |
-
"plt.tight_layout(pad=1)
|
| 493 |
"plt.show()\n"
|
| 494 |
]
|
| 495 |
},
|
|
@@ -511,7 +503,7 @@
|
|
| 511 |
},
|
| 512 |
{
|
| 513 |
"cell_type": "code",
|
| 514 |
-
"execution_count":
|
| 515 |
"metadata": {
|
| 516 |
"colab": {
|
| 517 |
"base_uri": "https://localhost:8080/",
|
|
@@ -533,8 +525,8 @@
|
|
| 533 |
}
|
| 534 |
],
|
| 535 |
"source": [
|
| 536 |
-
"#
|
| 537 |
-
"sns.pairplot(df[num_cols], diag_kind=\"kde\", corner=True, height=1.5)
|
| 538 |
"plt.show()\n"
|
| 539 |
]
|
| 540 |
},
|
|
@@ -559,7 +551,7 @@
|
|
| 559 |
"id": "PIGWwIhF9HT_"
|
| 560 |
},
|
| 561 |
"source": [
|
| 562 |
-
"#Correlation Heatmap\n",
|
| 563 |
"\n",
|
| 564 |
"This heatmap visualizes the correlation between numerical features.\n",
|
| 565 |
"Strong correlations indicate possible redundancy, while weak correlations may suggest independent variables.\n",
|
|
@@ -568,7 +560,7 @@
|
|
| 568 |
},
|
| 569 |
{
|
| 570 |
"cell_type": "code",
|
| 571 |
-
"execution_count":
|
| 572 |
"metadata": {
|
| 573 |
"colab": {
|
| 574 |
"base_uri": "https://localhost:8080/",
|
|
@@ -590,7 +582,7 @@
|
|
| 590 |
}
|
| 591 |
],
|
| 592 |
"source": [
|
| 593 |
-
"
|
| 594 |
"plt.figure(figsize=(12, 8))\n",
|
| 595 |
"corr_matrix = df[num_cols].corr()\n",
|
| 596 |
"\n",
|
|
@@ -643,7 +635,7 @@
|
|
| 643 |
"name": "python",
|
| 644 |
"nbconvert_exporter": "python",
|
| 645 |
"pygments_lexer": "ipython3",
|
| 646 |
-
"version": "3.
|
| 647 |
}
|
| 648 |
},
|
| 649 |
"nbformat": 4,
|
|
|
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
+
"execution_count": null,
|
| 17 |
"metadata": {
|
| 18 |
"id": "pLAYgHzBCh3U"
|
| 19 |
},
|
| 20 |
"outputs": [],
|
| 21 |
"source": [
|
| 22 |
+
"\n",
|
| 23 |
"import pandas as pd\n",
|
| 24 |
"import numpy as np\n",
|
| 25 |
"import matplotlib.pyplot as plt\n",
|
| 26 |
"import seaborn as sns\n",
|
| 27 |
"\n",
|
|
|
|
| 28 |
"from sklearn.model_selection import train_test_split\n",
|
| 29 |
"from sklearn.preprocessing import StandardScaler\n",
|
| 30 |
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
|
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"cell_type": "code",
|
| 37 |
+
"execution_count": null,
|
| 38 |
"metadata": {
|
| 39 |
"colab": {
|
| 40 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 257 |
}
|
| 258 |
],
|
| 259 |
"source": [
|
| 260 |
+
"\n",
|
| 261 |
+
"file_path = \"../Dataset/training_data.csv\" \n",
|
| 262 |
"df = pd.read_csv(file_path)\n",
|
| 263 |
"\n",
|
| 264 |
+
"\n",
|
| 265 |
"df.head()\n"
|
| 266 |
]
|
| 267 |
},
|
|
|
|
| 316 |
},
|
| 317 |
{
|
| 318 |
"cell_type": "code",
|
| 319 |
+
"execution_count": null,
|
| 320 |
"metadata": {
|
| 321 |
"colab": {
|
| 322 |
"base_uri": "https://localhost:8080/"
|
|
|
|
| 337 |
}
|
| 338 |
],
|
| 339 |
"source": [
|
| 340 |
+
"\n",
|
| 341 |
"bool_cols = [\"verified\", \"default_profile\", \"default_profile_image\"]\n",
|
| 342 |
"df[bool_cols] = df[bool_cols].astype(int)\n",
|
| 343 |
"\n",
|
| 344 |
+
"\n",
|
| 345 |
"print(df.dtypes[bool_cols])\n"
|
| 346 |
]
|
| 347 |
},
|
|
|
|
| 351 |
"id": "UWtBhVBX8Hy2"
|
| 352 |
},
|
| 353 |
"source": [
|
| 354 |
+
"# Data Visualization\n",
|
| 355 |
"Visualizing the distribution of numerical features to understand data patterns.\n",
|
| 356 |
"This helps in identifying skewness, outliers, and differences between bot and non-bot accounts.\n"
|
| 357 |
]
|
| 358 |
},
|
| 359 |
{
|
| 360 |
"cell_type": "code",
|
| 361 |
+
"execution_count": null,
|
| 362 |
"metadata": {
|
| 363 |
"colab": {
|
| 364 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 380 |
}
|
| 381 |
],
|
| 382 |
"source": [
|
|
|
|
| 383 |
"\n",
|
|
|
|
| 384 |
"num_cols = [\n",
|
| 385 |
" \"followers_count\", \"friends_count\", \"listedcount\", \"favourites_count\",\n",
|
| 386 |
" \"statuses_count\", \"verified\", \"default_profile\", \"default_profile_image\", \"bot\"\n",
|
| 387 |
"]\n",
|
| 388 |
"\n",
|
|
|
|
| 389 |
"plt.figure(figsize=(12, 8))\n",
|
| 390 |
"\n",
|
|
|
|
| 391 |
"for i, col in enumerate(num_cols):\n",
|
| 392 |
" plt.subplot(3, 3, i + 1)\n",
|
| 393 |
" sns.histplot(df[col], bins=20, kde=True)\n",
|
| 394 |
+
" plt.title(f\"{col} Distribution\", fontsize=10) \n",
|
| 395 |
+
" plt.xlabel(\"\") \n",
|
| 396 |
" plt.ylabel(\"\")\n",
|
| 397 |
"\n",
|
| 398 |
+
"plt.tight_layout(pad=1) \n",
|
| 399 |
"plt.show()\n",
|
| 400 |
"\n"
|
| 401 |
]
|
|
|
|
| 409 |
},
|
| 410 |
{
|
| 411 |
"cell_type": "code",
|
| 412 |
+
"execution_count": null,
|
| 413 |
"metadata": {
|
| 414 |
"colab": {
|
| 415 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 431 |
}
|
| 432 |
],
|
| 433 |
"source": [
|
| 434 |
+
"# Boxplots vs. Bot Label\n",
|
| 435 |
"\n",
|
|
|
|
| 436 |
"plt.figure(figsize=(12, 9))\n",
|
| 437 |
"\n",
|
|
|
|
| 438 |
"for i, col in enumerate(num_cols):\n",
|
| 439 |
" plt.subplot(3, 3, i + 1)\n",
|
| 440 |
" sns.boxplot(x=df[\"bot\"], y=df[col])\n",
|
| 441 |
+
" plt.title(f\"{col} vs. Bot\", fontsize=10) \n",
|
| 442 |
+
" plt.xlabel(\"\") \n",
|
| 443 |
" plt.ylabel(\"\")\n",
|
| 444 |
"\n",
|
| 445 |
+
"plt.tight_layout(pad=1) \n",
|
| 446 |
"plt.show()\n"
|
| 447 |
]
|
| 448 |
},
|
| 449 |
{
|
| 450 |
"cell_type": "code",
|
| 451 |
+
"execution_count": null,
|
| 452 |
"metadata": {
|
| 453 |
"colab": {
|
| 454 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 470 |
}
|
| 471 |
],
|
| 472 |
"source": [
|
| 473 |
+
"# Violin Plots vs. Bot Label \n",
|
| 474 |
"\n",
|
| 475 |
"plt.figure(figsize=(12, 9))\n",
|
| 476 |
"\n",
|
|
|
|
| 477 |
"for i, col in enumerate(num_cols):\n",
|
| 478 |
" plt.subplot(3, 3, i + 1)\n",
|
| 479 |
" sns.violinplot(x=df[\"bot\"], y=df[col])\n",
|
| 480 |
" plt.title(f\"{col} vs. Bot\", fontsize=10)\n",
|
| 481 |
+
" plt.xlabel(\"\") \n",
|
| 482 |
" plt.ylabel(\"\")\n",
|
| 483 |
"\n",
|
| 484 |
+
"plt.tight_layout(pad=1) \n",
|
| 485 |
"plt.show()\n"
|
| 486 |
]
|
| 487 |
},
|
|
|
|
| 503 |
},
|
| 504 |
{
|
| 505 |
"cell_type": "code",
|
| 506 |
+
"execution_count": null,
|
| 507 |
"metadata": {
|
| 508 |
"colab": {
|
| 509 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 525 |
}
|
| 526 |
],
|
| 527 |
"source": [
|
| 528 |
+
"# SNS Pairplot\n",
|
| 529 |
+
"sns.pairplot(df[num_cols], diag_kind=\"kde\", corner=True, height=1.5) \n",
|
| 530 |
"plt.show()\n"
|
| 531 |
]
|
| 532 |
},
|
|
|
|
| 551 |
"id": "PIGWwIhF9HT_"
|
| 552 |
},
|
| 553 |
"source": [
|
| 554 |
+
"# Correlation Heatmap\n",
|
| 555 |
"\n",
|
| 556 |
"This heatmap visualizes the correlation between numerical features.\n",
|
| 557 |
"Strong correlations indicate possible redundancy, while weak correlations may suggest independent variables.\n",
|
|
|
|
| 560 |
},
|
| 561 |
{
|
| 562 |
"cell_type": "code",
|
| 563 |
+
"execution_count": null,
|
| 564 |
"metadata": {
|
| 565 |
"colab": {
|
| 566 |
"base_uri": "https://localhost:8080/",
|
|
|
|
| 582 |
}
|
| 583 |
],
|
| 584 |
"source": [
|
| 585 |
+
"\n",
|
| 586 |
"plt.figure(figsize=(12, 8))\n",
|
| 587 |
"corr_matrix = df[num_cols].corr()\n",
|
| 588 |
"\n",
|
|
|
|
| 635 |
"name": "python",
|
| 636 |
"nbconvert_exporter": "python",
|
| 637 |
"pygments_lexer": "ipython3",
|
| 638 |
+
"version": "3.10.11"
|
| 639 |
}
|
| 640 |
},
|
| 641 |
"nbformat": 4,
|