ASHUT0SH-SiNGH committed on
Commit
389e084
·
1 Parent(s): 2fafc02

Improved UI

Browse files
Files changed (2) hide show
  1. app.py +29 -23
  2. bot-detection-model.ipynb +825 -79
app.py CHANGED
@@ -199,9 +199,8 @@ def main():
199
  st.title("🤖 Twitter Bot Detection System")
200
  st.markdown("""
201
  <div style='background-color: #262730; color: white; padding: 1rem; border-radius: 0.5rem; margin-bottom: 1rem;'>
202
- <h4>Welcome to the Advanced Bot Detection System</h4>
203
- <p>This advanced system analyzes Twitter accounts using machine learning to determine if they're automated bots or human users.
204
- Our system uses multiple features and sophisticated algorithms to provide accurate detection results.</p>
205
  </div>
206
  """, unsafe_allow_html=True)
207
 
@@ -250,7 +249,13 @@ def main():
250
  st.markdown("### Tweet Content")
251
  tweet_content = st.text_area("Sample Tweet", height=100) # UI stays, ignored in logic
252
 
 
 
 
 
 
253
  if st.button("🔍 Analyze Account"):
 
254
  with st.spinner('Analyzing account characteristics...'):
255
  # ✅ Build ONLY the exact 11 features your RF expects
256
  features = build_model_features_from_ui(
@@ -279,12 +284,11 @@ def main():
279
  else:
280
  st.success("👤 Human Account Detected!")
281
 
282
- metric_col1, metric_col2 = st.columns(2)
283
-
284
- with metric_col1:
285
- st.plotly_chart(create_gauge_chart(confidence, prediction_is_bot), use_container_width=True)
286
- with metric_col2:
287
- st.plotly_chart(create_probability_chart(probs), use_container_width=True)
288
 
289
  st.markdown("### Feature Analysis")
290
 
@@ -320,10 +324,13 @@ def main():
320
  color_continuous_scale='Viridis'
321
  )
322
  st.plotly_chart(fig, use_container_width=True)
 
 
 
323
 
324
  elif page == "CSV Analysis":
325
  st.title("CSV Batch Analysis")
326
- st.markdown("Upload a CSV file with account data to run batch predictions. You can use test_Click from Dataset folder of this repository.")
327
  uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
328
 
329
  if uploaded_file is not None:
@@ -416,9 +423,7 @@ def main():
416
  st.markdown("""
417
  <div class='info-box'>
418
  <h3>🎯 System Overview</h3>
419
- <p>Our Twitter Bot Detection System uses state-of-the-art machine learning algorithms to analyze Twitter accounts
420
- and determine whether they are automated bots or genuine human users. The system achieves this through multi-faceted
421
- analysis of various account characteristics and behaviors.</p>
422
  </div>
423
  """, unsafe_allow_html=True)
424
  st.markdown("### 🔑 Key Features Analyzed")
@@ -445,25 +450,21 @@ def main():
445
  - Friend acquisition rate
446
  - Network growth patterns
447
 
448
- #### Content Analysis
449
- - Tweet sentiment
450
- - Language patterns
451
- - URL sharing frequency
452
- - Hashtag usage
453
  """)
454
 
455
  st.markdown("""
456
  <div class='info-box'>
457
  <h3>⚙ Technical Implementation</h3>
458
- <p>The system employs a hierarchical classification approach:</p>
459
  <ul>
460
- <li><strong>Primary Analysis:</strong> Random Forest Classifier for behavioral patterns</li>
461
- <li><strong>Secondary Analysis:</strong> Natural Language Processing for content analysis</li>
462
- <li><strong>Final Decision:</strong> Weighted ensemble of multiple models</li>
 
463
  </ul>
464
  </div>
465
  """, unsafe_allow_html=True)
466
 
 
467
  st.markdown("### 📊 System Performance")
468
  metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4)
469
 
@@ -486,6 +487,10 @@ def main():
486
 
487
  else: # Statistics page
488
  st.title("System Statistics")
 
 
 
 
489
  col1, col2 = st.columns(2)
490
 
491
  with col1:
@@ -526,7 +531,7 @@ def main():
526
  fig = px.line(
527
  monthly_data,
528
  x='Month',
529
- y=['Bots Detected', 'Accuracy'],
530
  title='Monthly Performance Metrics',
531
  markers=True
532
  )
@@ -544,6 +549,7 @@ def main():
544
  with metric_col4:
545
  st.metric("Processing Time", "1.2s", "-0.3s")
546
 
 
547
 
548
  if __name__ == "__main__":
549
  main()
 
199
  st.title("🤖 Twitter Bot Detection System")
200
  st.markdown("""
201
  <div style='background-color: #262730; color: white; padding: 1rem; border-radius: 0.5rem; margin-bottom: 1rem;'>
202
+ <h4>Welcome to the Social Media Bot Detection System</h4>
203
+ <p>This application demonstrates a metadata-based machine learning approach for detecting automated social media accounts.</p>
 
204
  </div>
205
  """, unsafe_allow_html=True)
206
 
 
249
  st.markdown("### Tweet Content")
250
  tweet_content = st.text_area("Sample Tweet", height=100) # UI stays, ignored in logic
251
 
252
+ st.caption(
253
+ "Note: The prediction model uses only profile and activity metadata. "
254
+ "Text fields are shown for completeness and are not used in model inference."
255
+ )
256
+
257
  if st.button("🔍 Analyze Account"):
258
+
259
  with st.spinner('Analyzing account characteristics...'):
260
  # ✅ Build ONLY the exact 11 features your RF expects
261
  features = build_model_features_from_ui(
 
284
  else:
285
  st.success("👤 Human Account Detected!")
286
 
287
+ # Confidence gauge directly below the result
288
+ st.plotly_chart(
289
+ create_gauge_chart(confidence, prediction_is_bot),
290
+ use_container_width=True
291
+ )
 
292
 
293
  st.markdown("### Feature Analysis")
294
 
 
324
  color_continuous_scale='Viridis'
325
  )
326
  st.plotly_chart(fig, use_container_width=True)
327
+
328
+
329
+
330
 
331
  elif page == "CSV Analysis":
332
  st.title("CSV Batch Analysis")
333
+ st.markdown("Upload a CSV file with account data to run batch predictions. You can use \"testClick.csv\" from Dataset folder of this repository.")
334
  uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
335
 
336
  if uploaded_file is not None:
 
423
  st.markdown("""
424
  <div class='info-box'>
425
  <h3>🎯 System Overview</h3>
426
+ <p>Our Twitter Bot Detection System demonstrates a supervised machine learning approach for detecting automated social media accounts using structured profile and activity metadata. The goal of the system is to understand how different behavioral and account-level attributes contribute to identifying bot-like patterns, rather than relying on text or content-based signals.</p>
 
 
427
  </div>
428
  """, unsafe_allow_html=True)
429
  st.markdown("### 🔑 Key Features Analyzed")
 
450
  - Friend acquisition rate
451
  - Network growth patterns
452
 
 
 
 
 
 
453
  """)
454
 
455
  st.markdown("""
456
  <div class='info-box'>
457
  <h3>⚙ Technical Implementation</h3>
 
458
  <ul>
459
+ <li><strong>Data Processing:</strong> Cleaned and structured profile and activity metadata.</li>
460
+ <li><strong>Feature Engineering:</strong> Derived behavioral features such as follower–following ratio, posting activity, and account age.</li>
461
+ <li><strong>Modeling:</strong> Trained a Random Forest classifier on the engineered features.</li>
462
+ <li><strong>Explainability:</strong> Used feature importance to interpret model predictions.</li>
463
  </ul>
464
  </div>
465
  """, unsafe_allow_html=True)
466
 
467
+
468
  st.markdown("### 📊 System Performance")
469
  metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4)
470
 
 
487
 
488
  else: # Statistics page
489
  st.title("System Statistics")
490
+ st.info(
491
+ "This dashboard is a demo visualization intended to illustrate how system-level statistics and trends could be presented. The data shown here is illustrative and not generated from live usage or production logs."
492
+ )
493
+
494
  col1, col2 = st.columns(2)
495
 
496
  with col1:
 
531
  fig = px.line(
532
  monthly_data,
533
  x='Month',
534
+ y=['Accuracy','Bots Detected' ],
535
  title='Monthly Performance Metrics',
536
  markers=True
537
  )
 
549
  with metric_col4:
550
  st.metric("Processing Time", "1.2s", "-0.3s")
551
 
552
+ st.caption("*Demo Dashboard (Concept Visualization)*")
553
 
554
  if __name__ == "__main__":
555
  main()
bot-detection-model.ipynb CHANGED
@@ -2,14 +2,14 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": null,
6
  "metadata": {
7
  "execution": {
8
- "iopub.execute_input": "2026-01-16T03:42:30.469065Z",
9
- "iopub.status.busy": "2026-01-16T03:42:30.467530Z",
10
- "iopub.status.idle": "2026-01-16T03:42:30.474262Z",
11
- "shell.execute_reply": "2026-01-16T03:42:30.473090Z",
12
- "shell.execute_reply.started": "2026-01-16T03:42:30.468918Z"
13
  },
14
  "trusted": true
15
  },
@@ -25,18 +25,26 @@
25
  },
26
  {
27
  "cell_type": "code",
28
- "execution_count": null,
29
  "metadata": {
30
  "execution": {
31
- "iopub.execute_input": "2026-01-16T03:42:44.598336Z",
32
- "iopub.status.busy": "2026-01-16T03:42:44.598005Z",
33
- "iopub.status.idle": "2026-01-16T03:42:44.666341Z",
34
- "shell.execute_reply": "2026-01-16T03:42:44.665147Z",
35
- "shell.execute_reply.started": "2026-01-16T03:42:44.598308Z"
36
  },
37
  "trusted": true
38
  },
39
- "outputs": [],
 
 
 
 
 
 
 
 
40
  "source": [
41
  "# DATA_PATH = \"/kaggle/input/bot-detection-data/bot_detection_data.csv\"\n",
42
  "DATA_PATH = \"/kaggle/input/bot-detection-data/training_data.csv\"\n",
@@ -47,39 +55,244 @@
47
  },
48
  {
49
  "cell_type": "code",
50
- "execution_count": null,
51
- "metadata": {},
52
- "outputs": [],
53
- "source": []
54
- },
55
- {
56
- "cell_type": "code",
57
- "execution_count": null,
58
  "metadata": {
59
  "execution": {
60
- "iopub.execute_input": "2026-01-16T03:42:50.039918Z",
61
- "iopub.status.busy": "2026-01-16T03:42:50.039522Z",
62
- "iopub.status.idle": "2026-01-16T03:42:50.059844Z",
63
- "shell.execute_reply": "2026-01-16T03:42:50.058651Z",
64
- "shell.execute_reply.started": "2026-01-16T03:42:50.039876Z"
65
  },
66
  "trusted": true
67
  },
68
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  "source": [
70
  "df.head()"
71
  ]
72
  },
73
  {
74
  "cell_type": "code",
75
- "execution_count": null,
76
  "metadata": {
77
  "execution": {
78
- "iopub.execute_input": "2026-01-16T03:43:06.917403Z",
79
- "iopub.status.busy": "2026-01-16T03:43:06.916688Z",
80
- "iopub.status.idle": "2026-01-16T03:43:06.924961Z",
81
- "shell.execute_reply": "2026-01-16T03:43:06.924063Z",
82
- "shell.execute_reply.started": "2026-01-16T03:43:06.917366Z"
83
  },
84
  "trusted": true
85
  },
@@ -103,14 +316,14 @@
103
  },
104
  {
105
  "cell_type": "code",
106
- "execution_count": null,
107
  "metadata": {
108
  "execution": {
109
- "iopub.execute_input": "2026-01-16T03:43:16.183239Z",
110
- "iopub.status.busy": "2026-01-16T03:43:16.182880Z",
111
- "iopub.status.idle": "2026-01-16T03:43:16.189999Z",
112
- "shell.execute_reply": "2026-01-16T03:43:16.188760Z",
113
- "shell.execute_reply.started": "2026-01-16T03:43:16.183210Z"
114
  },
115
  "trusted": true
116
  },
@@ -129,14 +342,14 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": null,
133
  "metadata": {
134
  "execution": {
135
- "iopub.execute_input": "2026-01-16T03:43:52.115697Z",
136
- "iopub.status.busy": "2026-01-16T03:43:52.115333Z",
137
- "iopub.status.idle": "2026-01-16T03:43:52.121777Z",
138
- "shell.execute_reply": "2026-01-16T03:43:52.120660Z",
139
- "shell.execute_reply.started": "2026-01-16T03:43:52.115666Z"
140
  },
141
  "trusted": true
142
  },
@@ -147,14 +360,14 @@
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": null,
151
  "metadata": {
152
  "execution": {
153
- "iopub.execute_input": "2026-01-16T03:38:57.765197Z",
154
- "iopub.status.busy": "2026-01-16T03:38:57.764874Z",
155
- "iopub.status.idle": "2026-01-16T03:38:57.794042Z",
156
- "shell.execute_reply": "2026-01-16T03:38:57.793068Z",
157
- "shell.execute_reply.started": "2026-01-16T03:38:57.765161Z"
158
  },
159
  "trusted": true
160
  },
@@ -169,14 +382,14 @@
169
  },
170
  {
171
  "cell_type": "code",
172
- "execution_count": null,
173
  "metadata": {
174
  "execution": {
175
- "iopub.execute_input": "2026-01-16T03:38:57.795374Z",
176
- "iopub.status.busy": "2026-01-16T03:38:57.795084Z",
177
- "iopub.status.idle": "2026-01-16T03:38:57.817354Z",
178
- "shell.execute_reply": "2026-01-16T03:38:57.816386Z",
179
- "shell.execute_reply.started": "2026-01-16T03:38:57.795348Z"
180
  },
181
  "trusted": true
182
  },
@@ -194,18 +407,453 @@
194
  },
195
  {
196
  "cell_type": "code",
197
- "execution_count": null,
198
  "metadata": {
199
  "execution": {
200
- "iopub.execute_input": "2026-01-16T03:38:57.818883Z",
201
- "iopub.status.busy": "2026-01-16T03:38:57.818519Z",
202
- "iopub.status.idle": "2026-01-16T03:38:59.208010Z",
203
- "shell.execute_reply": "2026-01-16T03:38:59.207044Z",
204
- "shell.execute_reply.started": "2026-01-16T03:38:57.818853Z"
205
  },
206
  "trusted": true
207
  },
208
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  "source": [
210
  "from sklearn.ensemble import RandomForestClassifier\n",
211
  "\n",
@@ -223,18 +871,35 @@
223
  },
224
  {
225
  "cell_type": "code",
226
- "execution_count": null,
227
  "metadata": {
228
  "execution": {
229
- "iopub.execute_input": "2026-01-16T03:38:59.210120Z",
230
- "iopub.status.busy": "2026-01-16T03:38:59.209455Z",
231
- "iopub.status.idle": "2026-01-16T03:38:59.361078Z",
232
- "shell.execute_reply": "2026-01-16T03:38:59.360209Z",
233
- "shell.execute_reply.started": "2026-01-16T03:38:59.210087Z"
234
  },
235
  "trusted": true
236
  },
237
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  "source": [
239
  "preds = rf.predict(X_test)\n",
240
  "\n",
@@ -244,18 +909,37 @@
244
  },
245
  {
246
  "cell_type": "code",
247
- "execution_count": null,
248
  "metadata": {
249
  "execution": {
250
- "iopub.execute_input": "2026-01-16T03:38:59.363663Z",
251
- "iopub.status.busy": "2026-01-16T03:38:59.363334Z",
252
- "iopub.status.idle": "2026-01-16T03:38:59.445148Z",
253
- "shell.execute_reply": "2026-01-16T03:38:59.444321Z",
254
- "shell.execute_reply.started": "2026-01-16T03:38:59.363633Z"
255
  },
256
  "trusted": true
257
  },
258
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  "source": [
260
  "imp = pd.DataFrame({\n",
261
  " \"feature\": X.columns,\n",
@@ -265,6 +949,68 @@
265
  "print(imp)"
266
  ]
267
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  {
269
  "cell_type": "code",
270
  "execution_count": null,
@@ -306,7 +1052,7 @@
306
  "name": "python",
307
  "nbconvert_exporter": "python",
308
  "pygments_lexer": "ipython3",
309
- "version": "3.12.12"
310
  }
311
  },
312
  "nbformat": 4,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 15,
6
  "metadata": {
7
  "execution": {
8
+ "iopub.execute_input": "2026-01-20T09:42:14.745657Z",
9
+ "iopub.status.busy": "2026-01-20T09:42:14.744873Z",
10
+ "iopub.status.idle": "2026-01-20T09:42:14.750198Z",
11
+ "shell.execute_reply": "2026-01-20T09:42:14.749406Z",
12
+ "shell.execute_reply.started": "2026-01-20T09:42:14.745620Z"
13
  },
14
  "trusted": true
15
  },
 
25
  },
26
  {
27
  "cell_type": "code",
28
+ "execution_count": 16,
29
  "metadata": {
30
  "execution": {
31
+ "iopub.execute_input": "2026-01-20T09:42:14.752013Z",
32
+ "iopub.status.busy": "2026-01-20T09:42:14.751712Z",
33
+ "iopub.status.idle": "2026-01-20T09:42:14.831116Z",
34
+ "shell.execute_reply": "2026-01-20T09:42:14.830201Z",
35
+ "shell.execute_reply.started": "2026-01-20T09:42:14.751978Z"
36
  },
37
  "trusted": true
38
  },
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "(1562, 20)\n"
45
+ ]
46
+ }
47
+ ],
48
  "source": [
49
  "# DATA_PATH = \"/kaggle/input/bot-detection-data/bot_detection_data.csv\"\n",
50
  "DATA_PATH = \"/kaggle/input/bot-detection-data/training_data.csv\"\n",
 
55
  },
56
  {
57
  "cell_type": "code",
58
+ "execution_count": 17,
 
 
 
 
 
 
 
59
  "metadata": {
60
  "execution": {
61
+ "iopub.execute_input": "2026-01-20T09:42:14.833146Z",
62
+ "iopub.status.busy": "2026-01-20T09:42:14.832832Z",
63
+ "iopub.status.idle": "2026-01-20T09:42:14.849605Z",
64
+ "shell.execute_reply": "2026-01-20T09:42:14.848831Z",
65
+ "shell.execute_reply.started": "2026-01-20T09:42:14.833119Z"
66
  },
67
  "trusted": true
68
  },
69
+ "outputs": [
70
+ {
71
+ "data": {
72
+ "text/html": [
73
+ "<div>\n",
74
+ "<style scoped>\n",
75
+ " .dataframe tbody tr th:only-of-type {\n",
76
+ " vertical-align: middle;\n",
77
+ " }\n",
78
+ "\n",
79
+ " .dataframe tbody tr th {\n",
80
+ " vertical-align: top;\n",
81
+ " }\n",
82
+ "\n",
83
+ " .dataframe thead th {\n",
84
+ " text-align: right;\n",
85
+ " }\n",
86
+ "</style>\n",
87
+ "<table border=\"1\" class=\"dataframe\">\n",
88
+ " <thead>\n",
89
+ " <tr style=\"text-align: right;\">\n",
90
+ " <th></th>\n",
91
+ " <th>id</th>\n",
92
+ " <th>id_str</th>\n",
93
+ " <th>screen_name</th>\n",
94
+ " <th>location</th>\n",
95
+ " <th>description</th>\n",
96
+ " <th>url</th>\n",
97
+ " <th>followers_count</th>\n",
98
+ " <th>friends_count</th>\n",
99
+ " <th>listedcount</th>\n",
100
+ " <th>created_at</th>\n",
101
+ " <th>favourites_count</th>\n",
102
+ " <th>verified</th>\n",
103
+ " <th>statuses_count</th>\n",
104
+ " <th>lang</th>\n",
105
+ " <th>status</th>\n",
106
+ " <th>default_profile</th>\n",
107
+ " <th>default_profile_image</th>\n",
108
+ " <th>has_extended_profile</th>\n",
109
+ " <th>name</th>\n",
110
+ " <th>bot</th>\n",
111
+ " </tr>\n",
112
+ " </thead>\n",
113
+ " <tbody>\n",
114
+ " <tr>\n",
115
+ " <th>0</th>\n",
116
+ " <td>1.953701e+08</td>\n",
117
+ " <td>195370058</td>\n",
118
+ " <td>kanyejordan</td>\n",
119
+ " <td>NaN</td>\n",
120
+ " <td>This is what I do. I drop truth bombs.</td>\n",
121
+ " <td>NaN</td>\n",
122
+ " <td>2925</td>\n",
123
+ " <td>3</td>\n",
124
+ " <td>139</td>\n",
125
+ " <td>9/26/2010 14:45</td>\n",
126
+ " <td>0</td>\n",
127
+ " <td>False</td>\n",
128
+ " <td>708</td>\n",
129
+ " <td>en</td>\n",
130
+ " <td>Status(in_reply_to_status_id=None, favorited=F...</td>\n",
131
+ " <td>True</td>\n",
132
+ " <td>False</td>\n",
133
+ " <td>False</td>\n",
134
+ " <td>Kanye Jordan</td>\n",
135
+ " <td>1</td>\n",
136
+ " </tr>\n",
137
+ " <tr>\n",
138
+ " <th>1</th>\n",
139
+ " <td>7.950000e+17</td>\n",
140
+ " <td>7.95E+17</td>\n",
141
+ " <td>astronaut_bot</td>\n",
142
+ " <td>NaN</td>\n",
143
+ " <td>Keeping an eye on astronauts coming and going....</td>\n",
144
+ " <td>NaN</td>\n",
145
+ " <td>9</td>\n",
146
+ " <td>0</td>\n",
147
+ " <td>5</td>\n",
148
+ " <td>Fri Nov 04 12:11:27 +0000 2016</td>\n",
149
+ " <td>0</td>\n",
150
+ " <td>False</td>\n",
151
+ " <td>6</td>\n",
152
+ " <td>en</td>\n",
153
+ " <td>{'created_at': 'Tue Nov 22 16:52:31 +0000 2016...</td>\n",
154
+ " <td>True</td>\n",
155
+ " <td>False</td>\n",
156
+ " <td>False</td>\n",
157
+ " <td>Astronaut Notifier</td>\n",
158
+ " <td>1</td>\n",
159
+ " </tr>\n",
160
+ " <tr>\n",
161
+ " <th>2</th>\n",
162
+ " <td>2.976541e+09</td>\n",
163
+ " <td>2976541239</td>\n",
164
+ " <td>TheRiddlerBot</td>\n",
165
+ " <td>Coimbra, Portugal</td>\n",
166
+ " <td>Solve the riddle by replying only the name of ...</td>\n",
167
+ " <td>https://t.co/1v8BON9QpT</td>\n",
168
+ " <td>132</td>\n",
169
+ " <td>46</td>\n",
170
+ " <td>24</td>\n",
171
+ " <td>1/13/2015 15:10</td>\n",
172
+ " <td>740</td>\n",
173
+ " <td>False</td>\n",
174
+ " <td>7346</td>\n",
175
+ " <td>en</td>\n",
176
+ " <td>Status(contributors=None, truncated=False, tex...</td>\n",
177
+ " <td>True</td>\n",
178
+ " <td>False</td>\n",
179
+ " <td>False</td>\n",
180
+ " <td>TheRiddlerBot</td>\n",
181
+ " <td>1</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>3</th>\n",
185
+ " <td>2.243832e+08</td>\n",
186
+ " <td>224383150</td>\n",
187
+ " <td>mlegoudes262</td>\n",
188
+ " <td>NaN</td>\n",
189
+ " <td>NaN</td>\n",
190
+ " <td>NaN</td>\n",
191
+ " <td>54</td>\n",
192
+ " <td>1351</td>\n",
193
+ " <td>0</td>\n",
194
+ " <td>Wed Dec 08 21:29:31 +0000 2010</td>\n",
195
+ " <td>2</td>\n",
196
+ " <td>False</td>\n",
197
+ " <td>6</td>\n",
198
+ " <td>en</td>\n",
199
+ " <td>{'truncated': False, 'entities': {'user_mentio...</td>\n",
200
+ " <td>True</td>\n",
201
+ " <td>False</td>\n",
202
+ " <td>False</td>\n",
203
+ " <td>Laurie Poulsen</td>\n",
204
+ " <td>1</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>4</th>\n",
208
+ " <td>1.134712e+07</td>\n",
209
+ " <td>11347122</td>\n",
210
+ " <td>GavinNewsom</td>\n",
211
+ " <td>California</td>\n",
212
+ " <td>Husband &amp; father. 49th Lt. Gov. of California ...</td>\n",
213
+ " <td>https://t.co/XrGnfzTDJD</td>\n",
214
+ " <td>1300380</td>\n",
215
+ " <td>24248</td>\n",
216
+ " <td>7089</td>\n",
217
+ " <td>Wed Dec 19 19:53:42 +0000 2007</td>\n",
218
+ " <td>4184</td>\n",
219
+ " <td>True</td>\n",
220
+ " <td>8536</td>\n",
221
+ " <td>en</td>\n",
222
+ " <td>{u'contributors': None, u'truncated': True, u'...</td>\n",
223
+ " <td>False</td>\n",
224
+ " <td>False</td>\n",
225
+ " <td>False</td>\n",
226
+ " <td>Gavin Newsom</td>\n",
227
+ " <td>0</td>\n",
228
+ " </tr>\n",
229
+ " </tbody>\n",
230
+ "</table>\n",
231
+ "</div>"
232
+ ],
233
+ "text/plain": [
234
+ " id id_str screen_name location \\\n",
235
+ "0 1.953701e+08 195370058 kanyejordan NaN \n",
236
+ "1 7.950000e+17 7.95E+17 astronaut_bot NaN \n",
237
+ "2 2.976541e+09 2976541239 TheRiddlerBot Coimbra, Portugal \n",
238
+ "3 2.243832e+08 224383150 mlegoudes262 NaN \n",
239
+ "4 1.134712e+07 11347122 GavinNewsom California \n",
240
+ "\n",
241
+ " description url \\\n",
242
+ "0 This is what I do. I drop truth bombs. NaN \n",
243
+ "1 Keeping an eye on astronauts coming and going.... NaN \n",
244
+ "2 Solve the riddle by replying only the name of ... https://t.co/1v8BON9QpT \n",
245
+ "3 NaN NaN \n",
246
+ "4 Husband & father. 49th Lt. Gov. of California ... https://t.co/XrGnfzTDJD \n",
247
+ "\n",
248
+ " followers_count friends_count listedcount \\\n",
249
+ "0 2925 3 139 \n",
250
+ "1 9 0 5 \n",
251
+ "2 132 46 24 \n",
252
+ "3 54 1351 0 \n",
253
+ "4 1300380 24248 7089 \n",
254
+ "\n",
255
+ " created_at favourites_count verified statuses_count \\\n",
256
+ "0 9/26/2010 14:45 0 False 708 \n",
257
+ "1 Fri Nov 04 12:11:27 +0000 2016 0 False 6 \n",
258
+ "2 1/13/2015 15:10 740 False 7346 \n",
259
+ "3 Wed Dec 08 21:29:31 +0000 2010 2 False 6 \n",
260
+ "4 Wed Dec 19 19:53:42 +0000 2007 4184 True 8536 \n",
261
+ "\n",
262
+ " lang status default_profile \\\n",
263
+ "0 en Status(in_reply_to_status_id=None, favorited=F... True \n",
264
+ "1 en {'created_at': 'Tue Nov 22 16:52:31 +0000 2016... True \n",
265
+ "2 en Status(contributors=None, truncated=False, tex... True \n",
266
+ "3 en {'truncated': False, 'entities': {'user_mentio... True \n",
267
+ "4 en {u'contributors': None, u'truncated': True, u'... False \n",
268
+ "\n",
269
+ " default_profile_image has_extended_profile name bot \n",
270
+ "0 False False Kanye Jordan 1 \n",
271
+ "1 False False Astronaut Notifier 1 \n",
272
+ "2 False False TheRiddlerBot 1 \n",
273
+ "3 False False Laurie Poulsen 1 \n",
274
+ "4 False False Gavin Newsom 0 "
275
+ ]
276
+ },
277
+ "execution_count": 17,
278
+ "metadata": {},
279
+ "output_type": "execute_result"
280
+ }
281
+ ],
282
  "source": [
283
  "df.head()"
284
  ]
285
  },
286
  {
287
  "cell_type": "code",
288
+ "execution_count": 18,
289
  "metadata": {
290
  "execution": {
291
+ "iopub.execute_input": "2026-01-20T09:42:14.851012Z",
292
+ "iopub.status.busy": "2026-01-20T09:42:14.850700Z",
293
+ "iopub.status.idle": "2026-01-20T09:42:14.867213Z",
294
+ "shell.execute_reply": "2026-01-20T09:42:14.866311Z",
295
+ "shell.execute_reply.started": "2026-01-20T09:42:14.850985Z"
296
  },
297
  "trusted": true
298
  },
 
316
  },
317
  {
318
  "cell_type": "code",
319
+ "execution_count": 19,
320
  "metadata": {
321
  "execution": {
322
+ "iopub.execute_input": "2026-01-20T09:42:14.869442Z",
323
+ "iopub.status.busy": "2026-01-20T09:42:14.869099Z",
324
+ "iopub.status.idle": "2026-01-20T09:42:14.884158Z",
325
+ "shell.execute_reply": "2026-01-20T09:42:14.883300Z",
326
+ "shell.execute_reply.started": "2026-01-20T09:42:14.869405Z"
327
  },
328
  "trusted": true
329
  },
 
342
  },
343
  {
344
  "cell_type": "code",
345
+ "execution_count": 20,
346
  "metadata": {
347
  "execution": {
348
+ "iopub.execute_input": "2026-01-20T09:42:14.885944Z",
349
+ "iopub.status.busy": "2026-01-20T09:42:14.885337Z",
350
+ "iopub.status.idle": "2026-01-20T09:42:14.899788Z",
351
+ "shell.execute_reply": "2026-01-20T09:42:14.898857Z",
352
+ "shell.execute_reply.started": "2026-01-20T09:42:14.885913Z"
353
  },
354
  "trusted": true
355
  },
 
360
  },
361
  {
362
  "cell_type": "code",
363
+ "execution_count": 21,
364
  "metadata": {
365
  "execution": {
366
+ "iopub.execute_input": "2026-01-20T09:42:14.901446Z",
367
+ "iopub.status.busy": "2026-01-20T09:42:14.901029Z",
368
+ "iopub.status.idle": "2026-01-20T09:42:14.920930Z",
369
+ "shell.execute_reply": "2026-01-20T09:42:14.920119Z",
370
+ "shell.execute_reply.started": "2026-01-20T09:42:14.901408Z"
371
  },
372
  "trusted": true
373
  },
 
382
  },
383
  {
384
  "cell_type": "code",
385
+ "execution_count": 22,
386
  "metadata": {
387
  "execution": {
388
+ "iopub.execute_input": "2026-01-20T09:42:14.922400Z",
389
+ "iopub.status.busy": "2026-01-20T09:42:14.922068Z",
390
+ "iopub.status.idle": "2026-01-20T09:42:14.940152Z",
391
+ "shell.execute_reply": "2026-01-20T09:42:14.939293Z",
392
+ "shell.execute_reply.started": "2026-01-20T09:42:14.922365Z"
393
  },
394
  "trusted": true
395
  },
 
407
  },
408
  {
409
  "cell_type": "code",
410
+ "execution_count": 23,
411
  "metadata": {
412
  "execution": {
413
+ "iopub.execute_input": "2026-01-20T09:42:14.941565Z",
414
+ "iopub.status.busy": "2026-01-20T09:42:14.941261Z",
415
+ "iopub.status.idle": "2026-01-20T09:42:15.734600Z",
416
+ "shell.execute_reply": "2026-01-20T09:42:15.733765Z",
417
+ "shell.execute_reply.started": "2026-01-20T09:42:14.941540Z"
418
  },
419
  "trusted": true
420
  },
421
+ "outputs": [
422
+ {
423
+ "data": {
424
+ "text/html": [
425
+ "<style>#sk-container-id-2 {\n",
426
+ " /* Definition of color scheme common for light and dark mode */\n",
427
+ " --sklearn-color-text: #000;\n",
428
+ " --sklearn-color-text-muted: #666;\n",
429
+ " --sklearn-color-line: gray;\n",
430
+ " /* Definition of color scheme for unfitted estimators */\n",
431
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
432
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
433
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
434
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
435
+ " /* Definition of color scheme for fitted estimators */\n",
436
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
437
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
438
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
439
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
440
+ "\n",
441
+ " /* Specific color for light theme */\n",
442
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
443
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
444
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
445
+ " --sklearn-color-icon: #696969;\n",
446
+ "\n",
447
+ " @media (prefers-color-scheme: dark) {\n",
448
+ " /* Redefinition of color scheme for dark theme */\n",
449
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
450
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
451
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
452
+ " --sklearn-color-icon: #878787;\n",
453
+ " }\n",
454
+ "}\n",
455
+ "\n",
456
+ "#sk-container-id-2 {\n",
457
+ " color: var(--sklearn-color-text);\n",
458
+ "}\n",
459
+ "\n",
460
+ "#sk-container-id-2 pre {\n",
461
+ " padding: 0;\n",
462
+ "}\n",
463
+ "\n",
464
+ "#sk-container-id-2 input.sk-hidden--visually {\n",
465
+ " border: 0;\n",
466
+ " clip: rect(1px 1px 1px 1px);\n",
467
+ " clip: rect(1px, 1px, 1px, 1px);\n",
468
+ " height: 1px;\n",
469
+ " margin: -1px;\n",
470
+ " overflow: hidden;\n",
471
+ " padding: 0;\n",
472
+ " position: absolute;\n",
473
+ " width: 1px;\n",
474
+ "}\n",
475
+ "\n",
476
+ "#sk-container-id-2 div.sk-dashed-wrapped {\n",
477
+ " border: 1px dashed var(--sklearn-color-line);\n",
478
+ " margin: 0 0.4em 0.5em 0.4em;\n",
479
+ " box-sizing: border-box;\n",
480
+ " padding-bottom: 0.4em;\n",
481
+ " background-color: var(--sklearn-color-background);\n",
482
+ "}\n",
483
+ "\n",
484
+ "#sk-container-id-2 div.sk-container {\n",
485
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
486
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
487
+ " so we also need the `!important` here to be able to override the\n",
488
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
489
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
490
+ " display: inline-block !important;\n",
491
+ " position: relative;\n",
492
+ "}\n",
493
+ "\n",
494
+ "#sk-container-id-2 div.sk-text-repr-fallback {\n",
495
+ " display: none;\n",
496
+ "}\n",
497
+ "\n",
498
+ "div.sk-parallel-item,\n",
499
+ "div.sk-serial,\n",
500
+ "div.sk-item {\n",
501
+ " /* draw centered vertical line to link estimators */\n",
502
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
503
+ " background-size: 2px 100%;\n",
504
+ " background-repeat: no-repeat;\n",
505
+ " background-position: center center;\n",
506
+ "}\n",
507
+ "\n",
508
+ "/* Parallel-specific style estimator block */\n",
509
+ "\n",
510
+ "#sk-container-id-2 div.sk-parallel-item::after {\n",
511
+ " content: \"\";\n",
512
+ " width: 100%;\n",
513
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
514
+ " flex-grow: 1;\n",
515
+ "}\n",
516
+ "\n",
517
+ "#sk-container-id-2 div.sk-parallel {\n",
518
+ " display: flex;\n",
519
+ " align-items: stretch;\n",
520
+ " justify-content: center;\n",
521
+ " background-color: var(--sklearn-color-background);\n",
522
+ " position: relative;\n",
523
+ "}\n",
524
+ "\n",
525
+ "#sk-container-id-2 div.sk-parallel-item {\n",
526
+ " display: flex;\n",
527
+ " flex-direction: column;\n",
528
+ "}\n",
529
+ "\n",
530
+ "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
531
+ " align-self: flex-end;\n",
532
+ " width: 50%;\n",
533
+ "}\n",
534
+ "\n",
535
+ "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
536
+ " align-self: flex-start;\n",
537
+ " width: 50%;\n",
538
+ "}\n",
539
+ "\n",
540
+ "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
541
+ " width: 0;\n",
542
+ "}\n",
543
+ "\n",
544
+ "/* Serial-specific style estimator block */\n",
545
+ "\n",
546
+ "#sk-container-id-2 div.sk-serial {\n",
547
+ " display: flex;\n",
548
+ " flex-direction: column;\n",
549
+ " align-items: center;\n",
550
+ " background-color: var(--sklearn-color-background);\n",
551
+ " padding-right: 1em;\n",
552
+ " padding-left: 1em;\n",
553
+ "}\n",
554
+ "\n",
555
+ "\n",
556
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
557
+ "clickable and can be expanded/collapsed.\n",
558
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
559
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
560
+ "*/\n",
561
+ "\n",
562
+ "/* Pipeline and ColumnTransformer style (default) */\n",
563
+ "\n",
564
+ "#sk-container-id-2 div.sk-toggleable {\n",
565
+ " /* Default theme specific background. It is overwritten whether we have a\n",
566
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
567
+ " background-color: var(--sklearn-color-background);\n",
568
+ "}\n",
569
+ "\n",
570
+ "/* Toggleable label */\n",
571
+ "#sk-container-id-2 label.sk-toggleable__label {\n",
572
+ " cursor: pointer;\n",
573
+ " display: flex;\n",
574
+ " width: 100%;\n",
575
+ " margin-bottom: 0;\n",
576
+ " padding: 0.5em;\n",
577
+ " box-sizing: border-box;\n",
578
+ " text-align: center;\n",
579
+ " align-items: start;\n",
580
+ " justify-content: space-between;\n",
581
+ " gap: 0.5em;\n",
582
+ "}\n",
583
+ "\n",
584
+ "#sk-container-id-2 label.sk-toggleable__label .caption {\n",
585
+ " font-size: 0.6rem;\n",
586
+ " font-weight: lighter;\n",
587
+ " color: var(--sklearn-color-text-muted);\n",
588
+ "}\n",
589
+ "\n",
590
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
591
+ " /* Arrow on the left of the label */\n",
592
+ " content: \"▸\";\n",
593
+ " float: left;\n",
594
+ " margin-right: 0.25em;\n",
595
+ " color: var(--sklearn-color-icon);\n",
596
+ "}\n",
597
+ "\n",
598
+ "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
599
+ " color: var(--sklearn-color-text);\n",
600
+ "}\n",
601
+ "\n",
602
+ "/* Toggleable content - dropdown */\n",
603
+ "\n",
604
+ "#sk-container-id-2 div.sk-toggleable__content {\n",
605
+ " max-height: 0;\n",
606
+ " max-width: 0;\n",
607
+ " overflow: hidden;\n",
608
+ " text-align: left;\n",
609
+ " /* unfitted */\n",
610
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
611
+ "}\n",
612
+ "\n",
613
+ "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
614
+ " /* fitted */\n",
615
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
616
+ "}\n",
617
+ "\n",
618
+ "#sk-container-id-2 div.sk-toggleable__content pre {\n",
619
+ " margin: 0.2em;\n",
620
+ " border-radius: 0.25em;\n",
621
+ " color: var(--sklearn-color-text);\n",
622
+ " /* unfitted */\n",
623
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
624
+ "}\n",
625
+ "\n",
626
+ "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
627
+ " /* fitted */\n",
628
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
629
+ "}\n",
630
+ "\n",
631
+ "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
632
+ " /* Expand drop-down */\n",
633
+ " max-height: 200px;\n",
634
+ " max-width: 100%;\n",
635
+ " overflow: auto;\n",
636
+ "}\n",
637
+ "\n",
638
+ "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
639
+ " content: \"▾\";\n",
640
+ "}\n",
641
+ "\n",
642
+ "/* Pipeline/ColumnTransformer-specific style */\n",
643
+ "\n",
644
+ "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
645
+ " color: var(--sklearn-color-text);\n",
646
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
647
+ "}\n",
648
+ "\n",
649
+ "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
650
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
651
+ "}\n",
652
+ "\n",
653
+ "/* Estimator-specific style */\n",
654
+ "\n",
655
+ "/* Colorize estimator box */\n",
656
+ "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
657
+ " /* unfitted */\n",
658
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
659
+ "}\n",
660
+ "\n",
661
+ "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
662
+ " /* fitted */\n",
663
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
664
+ "}\n",
665
+ "\n",
666
+ "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
667
+ "#sk-container-id-2 div.sk-label label {\n",
668
+ " /* The background is the default theme color */\n",
669
+ " color: var(--sklearn-color-text-on-default-background);\n",
670
+ "}\n",
671
+ "\n",
672
+ "/* On hover, darken the color of the background */\n",
673
+ "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
674
+ " color: var(--sklearn-color-text);\n",
675
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
676
+ "}\n",
677
+ "\n",
678
+ "/* Label box, darken color on hover, fitted */\n",
679
+ "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
680
+ " color: var(--sklearn-color-text);\n",
681
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
682
+ "}\n",
683
+ "\n",
684
+ "/* Estimator label */\n",
685
+ "\n",
686
+ "#sk-container-id-2 div.sk-label label {\n",
687
+ " font-family: monospace;\n",
688
+ " font-weight: bold;\n",
689
+ " display: inline-block;\n",
690
+ " line-height: 1.2em;\n",
691
+ "}\n",
692
+ "\n",
693
+ "#sk-container-id-2 div.sk-label-container {\n",
694
+ " text-align: center;\n",
695
+ "}\n",
696
+ "\n",
697
+ "/* Estimator-specific */\n",
698
+ "#sk-container-id-2 div.sk-estimator {\n",
699
+ " font-family: monospace;\n",
700
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
701
+ " border-radius: 0.25em;\n",
702
+ " box-sizing: border-box;\n",
703
+ " margin-bottom: 0.5em;\n",
704
+ " /* unfitted */\n",
705
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
706
+ "}\n",
707
+ "\n",
708
+ "#sk-container-id-2 div.sk-estimator.fitted {\n",
709
+ " /* fitted */\n",
710
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
711
+ "}\n",
712
+ "\n",
713
+ "/* on hover */\n",
714
+ "#sk-container-id-2 div.sk-estimator:hover {\n",
715
+ " /* unfitted */\n",
716
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
717
+ "}\n",
718
+ "\n",
719
+ "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
720
+ " /* fitted */\n",
721
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
722
+ "}\n",
723
+ "\n",
724
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
725
+ "\n",
726
+ "/* Common style for \"i\" and \"?\" */\n",
727
+ "\n",
728
+ ".sk-estimator-doc-link,\n",
729
+ "a:link.sk-estimator-doc-link,\n",
730
+ "a:visited.sk-estimator-doc-link {\n",
731
+ " float: right;\n",
732
+ " font-size: smaller;\n",
733
+ " line-height: 1em;\n",
734
+ " font-family: monospace;\n",
735
+ " background-color: var(--sklearn-color-background);\n",
736
+ " border-radius: 1em;\n",
737
+ " height: 1em;\n",
738
+ " width: 1em;\n",
739
+ " text-decoration: none !important;\n",
740
+ " margin-left: 0.5em;\n",
741
+ " text-align: center;\n",
742
+ " /* unfitted */\n",
743
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
744
+ " color: var(--sklearn-color-unfitted-level-1);\n",
745
+ "}\n",
746
+ "\n",
747
+ ".sk-estimator-doc-link.fitted,\n",
748
+ "a:link.sk-estimator-doc-link.fitted,\n",
749
+ "a:visited.sk-estimator-doc-link.fitted {\n",
750
+ " /* fitted */\n",
751
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
752
+ " color: var(--sklearn-color-fitted-level-1);\n",
753
+ "}\n",
754
+ "\n",
755
+ "/* On hover */\n",
756
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
757
+ ".sk-estimator-doc-link:hover,\n",
758
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
759
+ ".sk-estimator-doc-link:hover {\n",
760
+ " /* unfitted */\n",
761
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
762
+ " color: var(--sklearn-color-background);\n",
763
+ " text-decoration: none;\n",
764
+ "}\n",
765
+ "\n",
766
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
767
+ ".sk-estimator-doc-link.fitted:hover,\n",
768
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
769
+ ".sk-estimator-doc-link.fitted:hover {\n",
770
+ " /* fitted */\n",
771
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
772
+ " color: var(--sklearn-color-background);\n",
773
+ " text-decoration: none;\n",
774
+ "}\n",
775
+ "\n",
776
+ "/* Span, style for the box shown on hovering the info icon */\n",
777
+ ".sk-estimator-doc-link span {\n",
778
+ " display: none;\n",
779
+ " z-index: 9999;\n",
780
+ " position: relative;\n",
781
+ " font-weight: normal;\n",
782
+ " right: .2ex;\n",
783
+ " padding: .5ex;\n",
784
+ " margin: .5ex;\n",
785
+ " width: min-content;\n",
786
+ " min-width: 20ex;\n",
787
+ " max-width: 50ex;\n",
788
+ " color: var(--sklearn-color-text);\n",
789
+ " box-shadow: 2pt 2pt 4pt #999;\n",
790
+ " /* unfitted */\n",
791
+ " background: var(--sklearn-color-unfitted-level-0);\n",
792
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
793
+ "}\n",
794
+ "\n",
795
+ ".sk-estimator-doc-link.fitted span {\n",
796
+ " /* fitted */\n",
797
+ " background: var(--sklearn-color-fitted-level-0);\n",
798
+ " border: var(--sklearn-color-fitted-level-3);\n",
799
+ "}\n",
800
+ "\n",
801
+ ".sk-estimator-doc-link:hover span {\n",
802
+ " display: block;\n",
803
+ "}\n",
804
+ "\n",
805
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
806
+ "\n",
807
+ "#sk-container-id-2 a.estimator_doc_link {\n",
808
+ " float: right;\n",
809
+ " font-size: 1rem;\n",
810
+ " line-height: 1em;\n",
811
+ " font-family: monospace;\n",
812
+ " background-color: var(--sklearn-color-background);\n",
813
+ " border-radius: 1rem;\n",
814
+ " height: 1rem;\n",
815
+ " width: 1rem;\n",
816
+ " text-decoration: none;\n",
817
+ " /* unfitted */\n",
818
+ " color: var(--sklearn-color-unfitted-level-1);\n",
819
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
820
+ "}\n",
821
+ "\n",
822
+ "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
823
+ " /* fitted */\n",
824
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
825
+ " color: var(--sklearn-color-fitted-level-1);\n",
826
+ "}\n",
827
+ "\n",
828
+ "/* On hover */\n",
829
+ "#sk-container-id-2 a.estimator_doc_link:hover {\n",
830
+ " /* unfitted */\n",
831
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
832
+ " color: var(--sklearn-color-background);\n",
833
+ " text-decoration: none;\n",
834
+ "}\n",
835
+ "\n",
836
+ "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
837
+ " /* fitted */\n",
838
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
839
+ "}\n",
840
+ "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(class_weight=&#x27;balanced&#x27;, max_depth=20,\n",
841
+ " min_samples_leaf=2, n_estimators=300, n_jobs=-1,\n",
842
+ " random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>RandomForestClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(class_weight=&#x27;balanced&#x27;, max_depth=20,\n",
843
+ " min_samples_leaf=2, n_estimators=300, n_jobs=-1,\n",
844
+ " random_state=42)</pre></div> </div></div></div></div>"
845
+ ],
846
+ "text/plain": [
847
+ "RandomForestClassifier(class_weight='balanced', max_depth=20,\n",
848
+ " min_samples_leaf=2, n_estimators=300, n_jobs=-1,\n",
849
+ " random_state=42)"
850
+ ]
851
+ },
852
+ "execution_count": 23,
853
+ "metadata": {},
854
+ "output_type": "execute_result"
855
+ }
856
+ ],
857
  "source": [
858
  "from sklearn.ensemble import RandomForestClassifier\n",
859
  "\n",
 
871
  },
872
  {
873
  "cell_type": "code",
874
+ "execution_count": 24,
875
  "metadata": {
876
  "execution": {
877
+ "iopub.execute_input": "2026-01-20T09:42:15.736130Z",
878
+ "iopub.status.busy": "2026-01-20T09:42:15.735775Z",
879
+ "iopub.status.idle": "2026-01-20T09:42:15.851114Z",
880
+ "shell.execute_reply": "2026-01-20T09:42:15.850291Z",
881
+ "shell.execute_reply.started": "2026-01-20T09:42:15.736093Z"
882
  },
883
  "trusted": true
884
  },
885
+ "outputs": [
886
+ {
887
+ "name": "stdout",
888
+ "output_type": "stream",
889
+ "text": [
890
+ "Accuracy: 0.8785942492012779\n",
891
+ " precision recall f1-score support\n",
892
+ "\n",
893
+ " 0 0.90 0.87 0.89 169\n",
894
+ " 1 0.85 0.89 0.87 144\n",
895
+ "\n",
896
+ " accuracy 0.88 313\n",
897
+ " macro avg 0.88 0.88 0.88 313\n",
898
+ "weighted avg 0.88 0.88 0.88 313\n",
899
+ "\n"
900
+ ]
901
+ }
902
+ ],
903
  "source": [
904
  "preds = rf.predict(X_test)\n",
905
  "\n",
 
909
  },
910
  {
911
  "cell_type": "code",
912
+ "execution_count": 25,
913
  "metadata": {
914
  "execution": {
915
+ "iopub.execute_input": "2026-01-20T09:42:15.853420Z",
916
+ "iopub.status.busy": "2026-01-20T09:42:15.853099Z",
917
+ "iopub.status.idle": "2026-01-20T09:42:15.919231Z",
918
+ "shell.execute_reply": "2026-01-20T09:42:15.918360Z",
919
+ "shell.execute_reply.started": "2026-01-20T09:42:15.853391Z"
920
  },
921
  "trusted": true
922
  },
923
+ "outputs": [
924
+ {
925
+ "name": "stdout",
926
+ "output_type": "stream",
927
+ "text": [
928
+ " feature importance\n",
929
+ "1 friends_count 0.204309\n",
930
+ "9 follow_ratio 0.144836\n",
931
+ "3 favourites_count 0.135528\n",
932
+ "0 followers_count 0.109556\n",
933
+ "5 verified 0.099516\n",
934
+ "10 account_age_days 0.090862\n",
935
+ "2 listedcount 0.088300\n",
936
+ "4 statuses_count 0.076216\n",
937
+ "6 default_profile 0.039780\n",
938
+ "8 has_extended_profile 0.008163\n",
939
+ "7 default_profile_image 0.002935\n"
940
+ ]
941
+ }
942
+ ],
943
  "source": [
944
  "imp = pd.DataFrame({\n",
945
  " \"feature\": X.columns,\n",
 
949
  "print(imp)"
950
  ]
951
  },
952
+ {
953
+ "cell_type": "code",
954
+ "execution_count": 26,
955
+ "metadata": {
956
+ "execution": {
957
+ "iopub.execute_input": "2026-01-20T09:42:15.920668Z",
958
+ "iopub.status.busy": "2026-01-20T09:42:15.920341Z",
959
+ "iopub.status.idle": "2026-01-20T09:42:16.022530Z",
960
+ "shell.execute_reply": "2026-01-20T09:42:16.021678Z",
961
+ "shell.execute_reply.started": "2026-01-20T09:42:15.920632Z"
962
+ },
963
+ "trusted": true
964
+ },
965
+ "outputs": [
966
+ {
967
+ "data": {
968
+ "text/plain": [
969
+ "['bot_model.joblib']"
970
+ ]
971
+ },
972
+ "execution_count": 26,
973
+ "metadata": {},
974
+ "output_type": "execute_result"
975
+ }
976
+ ],
977
+ "source": [
978
+ "import joblib\n",
979
+ "\n",
980
+ "joblib.dump(rf, \"bot_model.joblib\")"
981
+ ]
982
+ },
983
+ {
984
+ "cell_type": "code",
985
+ "execution_count": 27,
986
+ "metadata": {
987
+ "execution": {
988
+ "iopub.execute_input": "2026-01-20T09:42:16.024523Z",
989
+ "iopub.status.busy": "2026-01-20T09:42:16.023646Z",
990
+ "iopub.status.idle": "2026-01-20T09:42:16.029010Z",
991
+ "shell.execute_reply": "2026-01-20T09:42:16.028344Z",
992
+ "shell.execute_reply.started": "2026-01-20T09:42:16.024490Z"
993
+ },
994
+ "trusted": true
995
+ },
996
+ "outputs": [
997
+ {
998
+ "name": "stdout",
999
+ "output_type": "stream",
1000
+ "text": [
1001
+ "RF trained feature count: 11\n",
1002
+ "RF trained feature names:\n",
1003
+ "['followers_count', 'friends_count', 'listedcount', 'favourites_count', 'statuses_count', 'verified', 'default_profile', 'default_profile_image', 'has_extended_profile', 'follow_ratio', 'account_age_days']\n"
1004
+ ]
1005
+ }
1006
+ ],
1007
+ "source": [
1008
+ "# βœ… After training RF\n",
1009
+ "print(\"RF trained feature count:\", len(rf.feature_names_in_))\n",
1010
+ "print(\"RF trained feature names:\")\n",
1011
+ "print(list(rf.feature_names_in_))\n"
1012
+ ]
1013
+ },
1014
  {
1015
  "cell_type": "code",
1016
  "execution_count": null,
 
1052
  "name": "python",
1053
  "nbconvert_exporter": "python",
1054
  "pygments_lexer": "ipython3",
1055
+ "version": "3.10.11"
1056
  }
1057
  },
1058
  "nbformat": 4,