Files changed (1) hide show
  1. app.py +200 -158
app.py CHANGED
@@ -62,7 +62,6 @@ st.markdown("""
62
  </style>
63
  """, unsafe_allow_html=True)
64
 
65
-
66
  @st.cache_resource
67
  def load_model(model_path='bot_detector_model.pkl'):
68
  try:
@@ -77,7 +76,7 @@ def make_prediction(features, tweet_content, model_components):
77
  features_scaled = model_components['scaler'].transform(features)
78
  behavioral_probs = model_components['behavioral_model'].predict_proba(features_scaled)[0]
79
 
80
- if tweet_content:
81
  tweet_features = model_components['tweet_vectorizer'].transform([tweet_content])
82
  tweet_probs = model_components['tweet_model'].predict_proba(tweet_features)[0]
83
  final_probs = 0.8 * behavioral_probs + 0.2 * tweet_probs
@@ -86,7 +85,6 @@ def make_prediction(features, tweet_content, model_components):
86
 
87
  prediction = (final_probs[1] > 0.5)
88
  confidence = final_probs[1] if prediction else final_probs[0]
89
-
90
  return prediction, confidence, final_probs
91
 
92
  def create_gauge_chart(confidence, prediction):
@@ -128,10 +126,10 @@ def create_probability_chart(probs):
128
  return fig
129
 
130
  def main():
131
- # Sidebar
132
  st.sidebar.image("piclumen-1739279351872.png", width=100) # Replace with your logo
133
  st.sidebar.title("Navigation")
134
- page = st.sidebar.radio("Go to", ["Bot Detection", "About", "Statistics"])
135
 
136
  if page == "Bot Detection":
137
  st.title("🤖 Twitter Bot Detection System")
@@ -148,7 +146,7 @@ def main():
148
  if model_components is None:
149
  st.stop()
150
 
151
- # Create tabs
152
  tab1, tab2 = st.tabs(["📝 Input Details", "📊 Analysis Results"])
153
 
154
  with tab1:
@@ -172,7 +170,7 @@ def main():
172
  location = st.text_input("Location")
173
 
174
  st.markdown("### Account Properties")
175
- prop_col1, prop_col2, prop_col3, prop_col4 = st.columns(4)
176
 
177
  with prop_col1:
178
  verified = st.checkbox("Verified Account")
@@ -181,15 +179,16 @@ def main():
181
  with prop_col3:
182
  default_profile_image = st.checkbox("Default Profile Image")
183
 
 
184
  has_extended_profile = True
185
  has_url = True
186
 
187
  st.markdown("### Tweet Content")
188
- tweet_content = st.text_area("Sample Tweet ", height=100)
189
 
190
  if st.button("🔍 Analyze Account"):
191
  with st.spinner('Analyzing account characteristics...'):
192
- # Prepare features
193
  features = pd.DataFrame([{
194
  'followers_count': followers_count,
195
  'friends_count': friends_count,
@@ -215,28 +214,22 @@ def main():
215
  prediction, confidence, probs = make_prediction(features, tweet_content, model_components)
216
 
217
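  # Brief pause, then post a completion notice inside the results tab (nothing here selects that tab; the user opens it manually)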
218
- time.sleep(1) # Add small delay for effect
219
  tab2.markdown("### Analysis Complete!")
220
 
221
  with tab2:
222
- # Display main result
223
  if prediction:
224
  st.error("🤖 Bot Account Detected!")
225
  else:
226
  st.success("👤 Human Account Detected!")
227
 
228
- # Create three columns for visualizations
229
  metric_col1, metric_col2 = st.columns(2)
230
 
231
  with metric_col1:
232
- # Gauge chart
233
  st.plotly_chart(create_gauge_chart(confidence, prediction), use_container_width=True)
234
-
235
  with metric_col2:
236
- # Probability distribution
237
  st.plotly_chart(create_probability_chart(probs), use_container_width=True)
238
 
239
- # Feature importance
240
  st.markdown("### Feature Analysis")
241
  feature_importance = pd.DataFrame({
242
  'Feature': model_components['feature_names'],
@@ -244,168 +237,217 @@ def main():
244
  }).sort_values('Importance', ascending=False)
245
 
246
  fig = px.bar(feature_importance,
247
- x='Importance',
248
- y='Feature',
249
- orientation='h',
250
- title='Feature Importance Analysis')
251
  fig.update_layout(height=400)
252
  st.plotly_chart(fig, use_container_width=True)
253
 
254
- # Account metrics comparison
255
  metrics_data = {
256
  'Metric': ['Followers', 'Friends', 'Tweets', 'Favorites'],
257
  'Count': [followers_count, friends_count, statuses_count, favorites_count]
258
  }
259
  fig = px.bar(metrics_data,
260
- x='Metric',
261
- y='Count',
262
- title='Account Metrics Overview',
263
- color='Count',
264
- color_continuous_scale='Viridis')
265
  st.plotly_chart(fig, use_container_width=True)
266
-
267
- elif page == "About":
268
- st.title("About the Bot Detection System")
 
 
 
 
 
 
 
269
 
270
- # System Overview
271
- st.markdown("""
272
- <div class='info-box'>
273
- <h3>🎯 System Overview</h3>
274
- <p>Our Twitter Bot Detection System uses state-of-the-art machine learning algorithms to analyze Twitter accounts
275
- and determine whether they are automated bots or genuine human users. The system achieves this through multi-faceted
276
- analysis of various account characteristics and behaviors.</p>
277
- </div>
278
- """, unsafe_allow_html=True)
279
-
280
- # Key Features
281
- st.markdown("### πŸ”‘ Key Features Analyzed")
282
- col1, col2 = st.columns(2)
283
 
284
- with col1:
285
- st.markdown("""
286
- #### Account Characteristics
287
- - Profile completeness
288
- - Account age and verification status
289
- - Username patterns
290
- - Profile description analysis
291
-
292
- #### Behavioral Patterns
293
- - Posting frequency
294
- - Engagement rates
295
- - Temporal patterns
296
- - Content similarity
297
- """)
298
-
299
- with col2:
300
- st.markdown("""
301
- #### Network Analysis
302
- - Follower-following ratio
303
- - Friend acquisition rate
304
- - Network growth patterns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- #### Content Analysis
307
- - Tweet sentiment
308
- - Language patterns
309
- - URL sharing frequency
310
- - Hashtag usage
311
- """)
312
-
313
- # Technical Details
 
 
 
 
 
 
 
 
 
 
 
 
314
  st.markdown("""
315
- <div class='info-box'>
316
- <h3>βš™οΈ Technical Implementation</h3>
317
- <p>The system employs a hierarchical classification approach:</p>
318
- <ul>
319
- <li><strong>Primary Analysis:</strong> Random Forest Classifier for behavioral patterns</li>
320
- <li><strong>Secondary Analysis:</strong> Natural Language Processing for content analysis</li>
321
- <li><strong>Final Decision:</strong> Weighted ensemble of multiple models</li>
322
- </ul>
323
- </div>
324
- """, unsafe_allow_html=True)
325
-
326
- # Accuracy Metrics
327
- st.markdown("### πŸ“Š System Performance")
328
- metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4)
329
 
330
- with metrics_col1:
331
- st.metric("Accuracy", "87%")
332
- with metrics_col2:
333
- st.metric("Precision", "89%")
334
- with metrics_col3:
335
- st.metric("Recall", "83%")
336
- with metrics_col4:
337
- st.metric("F1 Score", "86%")
338
-
339
- # Use Cases
340
  st.markdown("""
341
- ### 🎯 Common Use Cases
342
- - **Social Media Management**: Identify and remove bot accounts
343
- - **Research**: Analyze social media manipulation
344
- - **Marketing**: Verify authentic engagement
345
- - **Security**: Protect against automated threats
 
 
 
 
 
346
  """)
347
-
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  else: # Statistics page
350
- st.title("System Statistics")
351
-
352
- # Add some sample statistics
353
- col1, col2 = st.columns(2)
354
-
355
- with col1:
356
- # Sample detection distribution
357
- detection_data = {
358
- 'Category': ['Bots', 'Humans'],
359
- 'Count': [324, 676]
360
- }
361
- fig = px.pie(detection_data,
362
- values='Count',
363
- names='Category',
364
- title='Detection Distribution',
365
- color_discrete_sequence=['#FF4B4B', '#00CC96'])
366
- st.plotly_chart(fig, use_container_width=True)
367
-
368
- with col2:
369
- # Confidence score distribution
370
- confidence_data = {
371
- 'Score': ['90-100%', '80-90%', '70-80%', '60-70%', '50-60%'],
372
- 'Count': [250, 300, 200, 150, 100]
373
- }
374
- fig = px.bar(confidence_data,
375
- x='Score',
376
- y='Count',
377
- title='Confidence Score Distribution',
378
- color='Count',
379
- color_continuous_scale='Viridis')
380
- st.plotly_chart(fig, use_container_width=True)
381
-
382
- # Monthly statistics
383
- st.markdown("### Monthly Detection Trends")
384
- monthly_data = {
385
- 'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
386
- 'Bots Detected': [45, 52, 38, 65, 48, 76],
387
- 'Accuracy': [92, 94, 93, 95, 94, 96]
388
  }
389
- fig = px.line(monthly_data,
390
- x='Month',
391
- y=['Bots Detected', 'Accuracy'],
392
- title='Monthly Performance Metrics',
393
- markers=True)
394
  st.plotly_chart(fig, use_container_width=True)
395
-
396
- # Key metrics
397
- st.markdown("### Key System Metrics")
398
- metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
399
 
400
- with metric_col1:
401
- st.metric("Total Analyses", "1,000", "+12%")
402
- with metric_col2:
403
- st.metric("Avg. Accuracy", "94.5%", "+2.3%")
404
- with metric_col3:
405
- st.metric("Bot Detection Rate", "32.4%", "-5.2%")
406
- with metric_col4:
407
- st.metric("Processing Time", "1.2s", "-0.3s")
408
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
  if __name__ == "__main__":
411
- main()
 
62
  </style>
63
  """, unsafe_allow_html=True)
64
 
 
65
  @st.cache_resource
66
  def load_model(model_path='bot_detector_model.pkl'):
67
  try:
 
76
  features_scaled = model_components['scaler'].transform(features)
77
  behavioral_probs = model_components['behavioral_model'].predict_proba(features_scaled)[0]
78
 
79
+ if tweet_content and tweet_content.strip():
80
  tweet_features = model_components['tweet_vectorizer'].transform([tweet_content])
81
  tweet_probs = model_components['tweet_model'].predict_proba(tweet_features)[0]
82
  final_probs = 0.8 * behavioral_probs + 0.2 * tweet_probs
 
85
 
86
  prediction = (final_probs[1] > 0.5)
87
  confidence = final_probs[1] if prediction else final_probs[0]
 
88
  return prediction, confidence, final_probs
89
 
90
  def create_gauge_chart(confidence, prediction):
 
126
  return fig
127
 
128
  def main():
129
+ # Sidebar with extended navigation
130
  st.sidebar.image("piclumen-1739279351872.png", width=100) # Replace with your logo
131
  st.sidebar.title("Navigation")
132
+ page = st.sidebar.radio("Go to", ["Bot Detection", "CSV Analysis", "About", "Statistics"])
133
 
134
  if page == "Bot Detection":
135
  st.title("🤖 Twitter Bot Detection System")
 
146
  if model_components is None:
147
  st.stop()
148
 
149
+ # Create tabs for individual account analysis
150
  tab1, tab2 = st.tabs(["📝 Input Details", "📊 Analysis Results"])
151
 
152
  with tab1:
 
170
  location = st.text_input("Location")
171
 
172
  st.markdown("### Account Properties")
173
+ prop_col1, prop_col2, prop_col3 = st.columns(3)
174
 
175
  with prop_col1:
176
  verified = st.checkbox("Verified Account")
 
179
  with prop_col3:
180
  default_profile_image = st.checkbox("Default Profile Image")
181
 
182
+ # These can be fixed or computed; here we assume True as default
183
  has_extended_profile = True
184
  has_url = True
185
 
186
  st.markdown("### Tweet Content")
187
+ tweet_content = st.text_area("Sample Tweet", height=100)
188
 
189
  if st.button("🔍 Analyze Account"):
190
  with st.spinner('Analyzing account characteristics...'):
191
+ # Prepare features for the single account
192
  features = pd.DataFrame([{
193
  'followers_count': followers_count,
194
  'friends_count': friends_count,
 
214
  prediction, confidence, probs = make_prediction(features, tweet_content, model_components)
215
 
216
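  # Brief pause, then post a completion notice inside the results tab (nothing here selects that tab; the user opens it manually)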
217
+ time.sleep(1)
218
  tab2.markdown("### Analysis Complete!")
219
 
220
  with tab2:
 
221
  if prediction:
222
  st.error("🤖 Bot Account Detected!")
223
  else:
224
  st.success("👤 Human Account Detected!")
225
 
 
226
  metric_col1, metric_col2 = st.columns(2)
227
 
228
  with metric_col1:
 
229
  st.plotly_chart(create_gauge_chart(confidence, prediction), use_container_width=True)
 
230
  with metric_col2:
 
231
  st.plotly_chart(create_probability_chart(probs), use_container_width=True)
232
 
 
233
  st.markdown("### Feature Analysis")
234
  feature_importance = pd.DataFrame({
235
  'Feature': model_components['feature_names'],
 
237
  }).sort_values('Importance', ascending=False)
238
 
239
  fig = px.bar(feature_importance,
240
+ x='Importance',
241
+ y='Feature',
242
+ orientation='h',
243
+ title='Feature Importance Analysis')
244
  fig.update_layout(height=400)
245
  st.plotly_chart(fig, use_container_width=True)
246
 
 
247
  metrics_data = {
248
  'Metric': ['Followers', 'Friends', 'Tweets', 'Favorites'],
249
  'Count': [followers_count, friends_count, statuses_count, favorites_count]
250
  }
251
  fig = px.bar(metrics_data,
252
+ x='Metric',
253
+ y='Count',
254
+ title='Account Metrics Overview',
255
+ color='Count',
256
+ color_continuous_scale='Viridis')
257
  st.plotly_chart(fig, use_container_width=True)
258
+
259
+ elif page == "CSV Analysis":
260
+ st.title("CSV Batch Analysis")
261
+ st.markdown("Upload a CSV file with account data to run batch predictions.")
262
+ uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
263
+
264
+ if uploaded_file is not None:
265
+ data = pd.read_csv(uploaded_file)
266
+ st.markdown("### CSV Data Preview")
267
+ st.dataframe(data.head())
268
 
269
+ model_components = load_model()
270
+ if model_components is None:
271
+ st.stop()
 
 
 
 
 
 
 
 
 
 
272
 
273
+ predictions = []
274
+ confidences = []
275
+
276
+ with st.spinner("Processing accounts..."):
277
+ for idx, row in data.iterrows():
278
+ features = pd.DataFrame([{
279
+ 'followers_count': row['followers_count'],
280
+ 'friends_count': row['friends_count'],
281
+ 'listed_count': row['listed_count'],
282
+ 'favorites_count': row['favorites_count'],
283
+ 'statuses_count': row['statuses_count'],
284
+ 'verified': int(row['verified']),
285
+ 'followers_friends_ratio': row['followers_count'] / (row['friends_count'] + 1),
286
+ 'statuses_per_day': row['statuses_count'] / (row['account_age (days)'] + 1),
287
+ 'engagement_ratio': row['favorites_count'] / (row['statuses_count'] + 1),
288
+ 'account_age_days': row['account_age (days)'],
289
+ 'name_length': len(row['username']),
290
+ 'name_has_digits': int(bool(re.search(r'\d', row['username']))),
291
+ 'description_length': len(row['description']),
292
+ 'has_location': int(bool(row['location'].strip())),
293
+ 'has_url': int(row['has_url']),
294
+ 'default_profile': int(row['default_profile']),
295
+ 'default_profile_image': int(row['default_profile_image']),
296
+ 'has_extended_profile': int(row['has_extended_profile'])
297
+ }])
298
+
299
+ tweet_text = row['tweet_content'] if 'tweet_content' in row else ""
300
+ pred, conf, _ = make_prediction(features, tweet_text, model_components)
301
+ predictions.append(pred)
302
+ confidences.append(conf)
303
+
304
+ data['prediction'] = predictions
305
+ data['confidence'] = confidences
306
+ st.markdown("### Batch Prediction Results")
307
+ st.dataframe(data)
308
+
309
+ # If ground truth labels are provided, compute evaluation metrics
310
+ if 'label' in data.columns:
311
+ y_true = data['label'].tolist()
312
+ y_pred = [int(p) for p in predictions]
313
+ from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
314
+ f1 = f1_score(y_true, y_pred, average='weighted')
315
+ precision = precision_score(y_true, y_pred, average='weighted')
316
+ recall = recall_score(y_true, y_pred, average='weighted')
317
+ report = classification_report(y_true, y_pred)
318
 
319
+ st.markdown("### Evaluation Metrics")
320
+ st.write("F1 Score:", f1)
321
+ st.write("Precision:", precision)
322
+ st.write("Recall:", recall)
323
+ st.text(report)
324
+
325
+ elif page == "About":
326
+ st.title("About the Bot Detection System")
327
+ st.markdown("""
328
+ <div class='info-box'>
329
+ <h3>🎯 System Overview</h3>
330
+ <p>Our Twitter Bot Detection System uses state-of-the-art machine learning algorithms to analyze Twitter accounts
331
+ and determine whether they are automated bots or genuine human users. The system achieves this through multi-faceted
332
+ analysis of various account characteristics and behaviors.</p>
333
+ </div>
334
+ """, unsafe_allow_html=True)
335
+ st.markdown("### 🔑 Key Features Analyzed")
336
+ col1, col2 = st.columns(2)
337
+
338
+ with col1:
339
  st.markdown("""
340
+ #### Account Characteristics
341
+ - Profile completeness
342
+ - Account age and verification status
343
+ - Username patterns
344
+ - Profile description analysis
 
 
 
 
 
 
 
 
 
345
 
346
+ #### Behavioral Patterns
347
+ - Posting frequency
348
+ - Engagement rates
349
+ - Temporal patterns
350
+ - Content similarity
351
+ """)
352
+ with col2:
 
 
 
353
  st.markdown("""
354
+ #### Network Analysis
355
+ - Follower-following ratio
356
+ - Friend acquisition rate
357
+ - Network growth patterns
358
+
359
+ #### Content Analysis
360
+ - Tweet sentiment
361
+ - Language patterns
362
+ - URL sharing frequency
363
+ - Hashtag usage
364
  """)
 
365
 
366
+ st.markdown("""
367
+ <div class='info-box'>
368
+ <h3>⚙️ Technical Implementation</h3>
369
+ <p>The system employs a hierarchical classification approach:</p>
370
+ <ul>
371
+ <li><strong>Primary Analysis:</strong> Random Forest Classifier for behavioral patterns</li>
372
+ <li><strong>Secondary Analysis:</strong> Natural Language Processing for content analysis</li>
373
+ <li><strong>Final Decision:</strong> Weighted ensemble of multiple models</li>
374
+ </ul>
375
+ </div>
376
+ """, unsafe_allow_html=True)
377
+
378
+ st.markdown("### 📊 System Performance")
379
+ metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4)
380
+
381
+ with metrics_col1:
382
+ st.metric("Accuracy", "87%")
383
+ with metrics_col2:
384
+ st.metric("Precision", "89%")
385
+ with metrics_col3:
386
+ st.metric("Recall", "83%")
387
+ with metrics_col4:
388
+ st.metric("F1 Score", "86%")
389
+
390
+ st.markdown("""
391
+ ### 🎯 Common Use Cases
392
+ - **Social Media Management**: Identify and remove bot accounts
393
+ - **Research**: Analyze social media manipulation
394
+ - **Marketing**: Verify authentic engagement
395
+ - **Security**: Protect against automated threats
396
+ """)
397
+
398
  else: # Statistics page
399
+ st.title("System Statistics")
400
+ col1, col2 = st.columns(2)
401
+
402
+ with col1:
403
+ detection_data = {
404
+ 'Category': ['Bots', 'Humans'],
405
+ 'Count': [324, 676]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  }
407
+ fig = px.pie(detection_data,
408
+ values='Count',
409
+ names='Category',
410
+ title='Detection Distribution',
411
+ color_discrete_sequence=['#FF4B4B', '#00CC96'])
412
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
413
 
414
+ with col2:
415
+ confidence_data = {
416
+ 'Score': ['90-100%', '80-90%', '70-80%', '60-70%', '50-60%'],
417
+ 'Count': [250, 300, 200, 150, 100]
418
+ }
419
+ fig = px.bar(confidence_data,
420
+ x='Score',
421
+ y='Count',
422
+ title='Confidence Score Distribution',
423
+ color='Count',
424
+ color_continuous_scale='Viridis')
425
+ st.plotly_chart(fig, use_container_width=True)
426
+
427
+ st.markdown("### Monthly Detection Trends")
428
+ monthly_data = {
429
+ 'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
430
+ 'Bots Detected': [45, 52, 38, 65, 48, 76],
431
+ 'Accuracy': [92, 94, 93, 95, 94, 96]
432
+ }
433
+ fig = px.line(monthly_data,
434
+ x='Month',
435
+ y=['Bots Detected', 'Accuracy'],
436
+ title='Monthly Performance Metrics',
437
+ markers=True)
438
+ st.plotly_chart(fig, use_container_width=True)
439
+
440
+ st.markdown("### Key System Metrics")
441
+ metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
442
+
443
+ with metric_col1:
444
+ st.metric("Total Analyses", "1,000", "+12%")
445
+ with metric_col2:
446
+ st.metric("Avg. Accuracy", "94.5%", "+2.3%")
447
+ with metric_col3:
448
+ st.metric("Bot Detection Rate", "32.4%", "-5.2%")
449
+ with metric_col4:
450
+ st.metric("Processing Time", "1.2s", "-0.3s")
451
 
452
  if __name__ == "__main__":
453
+ main()