ASHUT0SH-SiNGH committed on
Commit
1d571f0
·
1 Parent(s): 413103d

Update bot detection model and features

Browse files
Files changed (1) hide show
  1. BotDetectionEDA.ipynb +30 -38
BotDetectionEDA.ipynb CHANGED
@@ -13,19 +13,18 @@
13
  },
14
  {
15
  "cell_type": "code",
16
- "execution_count": 2,
17
  "metadata": {
18
  "id": "pLAYgHzBCh3U"
19
  },
20
  "outputs": [],
21
  "source": [
22
- "# Import necessary libraries for data manipulation, visualization, and machine learning\n",
23
  "import pandas as pd\n",
24
  "import numpy as np\n",
25
  "import matplotlib.pyplot as plt\n",
26
  "import seaborn as sns\n",
27
  "\n",
28
- "# Machine Learning Libraries\n",
29
  "from sklearn.model_selection import train_test_split\n",
30
  "from sklearn.preprocessing import StandardScaler\n",
31
  "from sklearn.ensemble import RandomForestClassifier\n",
@@ -35,7 +34,7 @@
35
  },
36
  {
37
  "cell_type": "code",
38
- "execution_count": 3,
39
  "metadata": {
40
  "colab": {
41
  "base_uri": "https://localhost:8080/",
@@ -258,11 +257,11 @@
258
  }
259
  ],
260
  "source": [
261
- "# Load the dataset\n",
262
- "file_path = \"../Dataset/training_data.csv\" # Update with your actual file path\n",
263
  "df = pd.read_csv(file_path)\n",
264
  "\n",
265
- "# Display first few rows\n",
266
  "df.head()\n"
267
  ]
268
  },
@@ -317,7 +316,7 @@
317
  },
318
  {
319
  "cell_type": "code",
320
- "execution_count": 5,
321
  "metadata": {
322
  "colab": {
323
  "base_uri": "https://localhost:8080/"
@@ -338,11 +337,11 @@
338
  }
339
  ],
340
  "source": [
341
- "# Convert boolean columns to integer (0 or 1)\n",
342
  "bool_cols = [\"verified\", \"default_profile\", \"default_profile_image\"]\n",
343
  "df[bool_cols] = df[bool_cols].astype(int)\n",
344
  "\n",
345
- "# Verify changes\n",
346
  "print(df.dtypes[bool_cols])\n"
347
  ]
348
  },
@@ -352,14 +351,14 @@
352
  "id": "UWtBhVBX8Hy2"
353
  },
354
  "source": [
355
- "#Data Visualization\n",
356
  "Visualizing the distribution of numerical features to understand data patterns.\n",
357
  "This helps in identifying skewness, outliers, and differences between bot and non-bot accounts.\n"
358
  ]
359
  },
360
  {
361
  "cell_type": "code",
362
- "execution_count": 6,
363
  "metadata": {
364
  "colab": {
365
  "base_uri": "https://localhost:8080/",
@@ -381,26 +380,22 @@
381
  }
382
  ],
383
  "source": [
384
- "# ---------------- Feature Distributions (Histograms & KDE) ---------------- #\n",
385
  "\n",
386
- "# Define numeric columns for visualization\n",
387
  "num_cols = [\n",
388
  " \"followers_count\", \"friends_count\", \"listedcount\", \"favourites_count\",\n",
389
  " \"statuses_count\", \"verified\", \"default_profile\", \"default_profile_image\", \"bot\"\n",
390
  "]\n",
391
  "\n",
392
- "# Reduce figure size while maintaining clarity\n",
393
  "plt.figure(figsize=(12, 8))\n",
394
  "\n",
395
- "# Loop through numeric columns and plot\n",
396
  "for i, col in enumerate(num_cols):\n",
397
  " plt.subplot(3, 3, i + 1)\n",
398
  " sns.histplot(df[col], bins=20, kde=True)\n",
399
- " plt.title(f\"{col} Distribution\", fontsize=10) # Reduce title font size\n",
400
- " plt.xlabel(\"\") # Remove x-labels for cleaner visuals\n",
401
  " plt.ylabel(\"\")\n",
402
  "\n",
403
- "plt.tight_layout(pad=1) # Adjust padding for compact layout\n",
404
  "plt.show()\n",
405
  "\n"
406
  ]
@@ -414,7 +409,7 @@
414
  },
415
  {
416
  "cell_type": "code",
417
- "execution_count": 7,
418
  "metadata": {
419
  "colab": {
420
  "base_uri": "https://localhost:8080/",
@@ -436,26 +431,24 @@
436
  }
437
  ],
438
  "source": [
439
- "# ---------------- Boxplots vs. Bot Label ---------------- #\n",
440
  "\n",
441
- "# Reduce figure size while maintaining readability\n",
442
  "plt.figure(figsize=(12, 9))\n",
443
  "\n",
444
- "# Loop through numeric columns and plot boxplots\n",
445
  "for i, col in enumerate(num_cols):\n",
446
  " plt.subplot(3, 3, i + 1)\n",
447
  " sns.boxplot(x=df[\"bot\"], y=df[col])\n",
448
- " plt.title(f\"{col} vs. Bot\", fontsize=10) # Smaller font size for compact view\n",
449
- " plt.xlabel(\"\") # Remove x-labels for cleaner visualization\n",
450
  " plt.ylabel(\"\")\n",
451
  "\n",
452
- "plt.tight_layout(pad=1) # Adjust padding for compact layout\n",
453
  "plt.show()\n"
454
  ]
455
  },
456
  {
457
  "cell_type": "code",
458
- "execution_count": 8,
459
  "metadata": {
460
  "colab": {
461
  "base_uri": "https://localhost:8080/",
@@ -477,19 +470,18 @@
477
  }
478
  ],
479
  "source": [
480
- "# ---------------- Violin Plots vs. Bot Label ---------------- #\n",
481
  "\n",
482
  "plt.figure(figsize=(12, 9))\n",
483
  "\n",
484
- "# Loop through numeric columns and plot violin plots\n",
485
  "for i, col in enumerate(num_cols):\n",
486
  " plt.subplot(3, 3, i + 1)\n",
487
  " sns.violinplot(x=df[\"bot\"], y=df[col])\n",
488
  " plt.title(f\"{col} vs. Bot\", fontsize=10)\n",
489
- " plt.xlabel(\"\") # Remove x-labels for cleaner visualization\n",
490
  " plt.ylabel(\"\")\n",
491
  "\n",
492
- "plt.tight_layout(pad=1) # Adjust padding for compact layout\n",
493
  "plt.show()\n"
494
  ]
495
  },
@@ -511,7 +503,7 @@
511
  },
512
  {
513
  "cell_type": "code",
514
- "execution_count": 9,
515
  "metadata": {
516
  "colab": {
517
  "base_uri": "https://localhost:8080/",
@@ -533,8 +525,8 @@
533
  }
534
  ],
535
  "source": [
536
- "# Pair Plot with Reduced Size\n",
537
- "sns.pairplot(df[num_cols], diag_kind=\"kde\", corner=True, height=1.5) # Reduce height\n",
538
  "plt.show()\n"
539
  ]
540
  },
@@ -559,7 +551,7 @@
559
  "id": "PIGWwIhF9HT_"
560
  },
561
  "source": [
562
- "#Correlation Heatmap\n",
563
  "\n",
564
  "This heatmap visualizes the correlation between numerical features.\n",
565
  "Strong correlations indicate possible redundancy, while weak correlations may suggest independent variables.\n",
@@ -568,7 +560,7 @@
568
  },
569
  {
570
  "cell_type": "code",
571
- "execution_count": 10,
572
  "metadata": {
573
  "colab": {
574
  "base_uri": "https://localhost:8080/",
@@ -590,7 +582,7 @@
590
  }
591
  ],
592
  "source": [
593
- "# ---------------- Correlation Heatmap ---------------- #\n",
594
  "plt.figure(figsize=(12, 8))\n",
595
  "corr_matrix = df[num_cols].corr()\n",
596
  "\n",
@@ -643,7 +635,7 @@
643
  "name": "python",
644
  "nbconvert_exporter": "python",
645
  "pygments_lexer": "ipython3",
646
- "version": "3.12.0"
647
  }
648
  },
649
  "nbformat": 4,
 
13
  },
14
  {
15
  "cell_type": "code",
16
+ "execution_count": null,
17
  "metadata": {
18
  "id": "pLAYgHzBCh3U"
19
  },
20
  "outputs": [],
21
  "source": [
22
+ "\n",
23
  "import pandas as pd\n",
24
  "import numpy as np\n",
25
  "import matplotlib.pyplot as plt\n",
26
  "import seaborn as sns\n",
27
  "\n",
 
28
  "from sklearn.model_selection import train_test_split\n",
29
  "from sklearn.preprocessing import StandardScaler\n",
30
  "from sklearn.ensemble import RandomForestClassifier\n",
 
34
  },
35
  {
36
  "cell_type": "code",
37
+ "execution_count": null,
38
  "metadata": {
39
  "colab": {
40
  "base_uri": "https://localhost:8080/",
 
257
  }
258
  ],
259
  "source": [
260
+ "\n",
261
+ "file_path = \"../Dataset/training_data.csv\" \n",
262
  "df = pd.read_csv(file_path)\n",
263
  "\n",
264
+ "\n",
265
  "df.head()\n"
266
  ]
267
  },
 
316
  },
317
  {
318
  "cell_type": "code",
319
+ "execution_count": null,
320
  "metadata": {
321
  "colab": {
322
  "base_uri": "https://localhost:8080/"
 
337
  }
338
  ],
339
  "source": [
340
+ "\n",
341
  "bool_cols = [\"verified\", \"default_profile\", \"default_profile_image\"]\n",
342
  "df[bool_cols] = df[bool_cols].astype(int)\n",
343
  "\n",
344
+ "\n",
345
  "print(df.dtypes[bool_cols])\n"
346
  ]
347
  },
 
351
  "id": "UWtBhVBX8Hy2"
352
  },
353
  "source": [
354
+ "# Data Visualization\n",
355
  "Visualizing the distribution of numerical features to understand data patterns.\n",
356
  "This helps in identifying skewness, outliers, and differences between bot and non-bot accounts.\n"
357
  ]
358
  },
359
  {
360
  "cell_type": "code",
361
+ "execution_count": null,
362
  "metadata": {
363
  "colab": {
364
  "base_uri": "https://localhost:8080/",
 
380
  }
381
  ],
382
  "source": [
 
383
  "\n",
 
384
  "num_cols = [\n",
385
  " \"followers_count\", \"friends_count\", \"listedcount\", \"favourites_count\",\n",
386
  " \"statuses_count\", \"verified\", \"default_profile\", \"default_profile_image\", \"bot\"\n",
387
  "]\n",
388
  "\n",
 
389
  "plt.figure(figsize=(12, 8))\n",
390
  "\n",
 
391
  "for i, col in enumerate(num_cols):\n",
392
  " plt.subplot(3, 3, i + 1)\n",
393
  " sns.histplot(df[col], bins=20, kde=True)\n",
394
+ " plt.title(f\"{col} Distribution\", fontsize=10) \n",
395
+ " plt.xlabel(\"\") \n",
396
  " plt.ylabel(\"\")\n",
397
  "\n",
398
+ "plt.tight_layout(pad=1) \n",
399
  "plt.show()\n",
400
  "\n"
401
  ]
 
409
  },
410
  {
411
  "cell_type": "code",
412
+ "execution_count": null,
413
  "metadata": {
414
  "colab": {
415
  "base_uri": "https://localhost:8080/",
 
431
  }
432
  ],
433
  "source": [
434
+ "# Boxplots vs. Bot Label\n",
435
  "\n",
 
436
  "plt.figure(figsize=(12, 9))\n",
437
  "\n",
 
438
  "for i, col in enumerate(num_cols):\n",
439
  " plt.subplot(3, 3, i + 1)\n",
440
  " sns.boxplot(x=df[\"bot\"], y=df[col])\n",
441
+ " plt.title(f\"{col} vs. Bot\", fontsize=10) \n",
442
+ " plt.xlabel(\"\") \n",
443
  " plt.ylabel(\"\")\n",
444
  "\n",
445
+ "plt.tight_layout(pad=1) \n",
446
  "plt.show()\n"
447
  ]
448
  },
449
  {
450
  "cell_type": "code",
451
+ "execution_count": null,
452
  "metadata": {
453
  "colab": {
454
  "base_uri": "https://localhost:8080/",
 
470
  }
471
  ],
472
  "source": [
473
+ "# Violin Plots vs. Bot Label \n",
474
  "\n",
475
  "plt.figure(figsize=(12, 9))\n",
476
  "\n",
 
477
  "for i, col in enumerate(num_cols):\n",
478
  " plt.subplot(3, 3, i + 1)\n",
479
  " sns.violinplot(x=df[\"bot\"], y=df[col])\n",
480
  " plt.title(f\"{col} vs. Bot\", fontsize=10)\n",
481
+ " plt.xlabel(\"\") \n",
482
  " plt.ylabel(\"\")\n",
483
  "\n",
484
+ "plt.tight_layout(pad=1) \n",
485
  "plt.show()\n"
486
  ]
487
  },
 
503
  },
504
  {
505
  "cell_type": "code",
506
+ "execution_count": null,
507
  "metadata": {
508
  "colab": {
509
  "base_uri": "https://localhost:8080/",
 
525
  }
526
  ],
527
  "source": [
528
+ "# SNS Pairplot\n",
529
+ "sns.pairplot(df[num_cols], diag_kind=\"kde\", corner=True, height=1.5) \n",
530
  "plt.show()\n"
531
  ]
532
  },
 
551
  "id": "PIGWwIhF9HT_"
552
  },
553
  "source": [
554
+ "# Correlation Heatmap\n",
555
  "\n",
556
  "This heatmap visualizes the correlation between numerical features.\n",
557
  "Strong correlations indicate possible redundancy, while weak correlations may suggest independent variables.\n",
 
560
  },
561
  {
562
  "cell_type": "code",
563
+ "execution_count": null,
564
  "metadata": {
565
  "colab": {
566
  "base_uri": "https://localhost:8080/",
 
582
  }
583
  ],
584
  "source": [
585
+ "\n",
586
  "plt.figure(figsize=(12, 8))\n",
587
  "corr_matrix = df[num_cols].corr()\n",
588
  "\n",
 
635
  "name": "python",
636
  "nbconvert_exporter": "python",
637
  "pygments_lexer": "ipython3",
638
+ "version": "3.10.11"
639
  }
640
  },
641
  "nbformat": 4,