Perth0603 committed on
Commit
2c14a34
·
verified ·
1 Parent(s): d8f11da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -56
app.py CHANGED
@@ -140,10 +140,6 @@ def _read_hosts_from_csv(path: str) -> Dict[str, str]:
140
 
141
 
142
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
143
- """
144
- MODULE 4: URL Analyzer - Feature Engineering
145
- Analyzes URL construction, domain composition, and critical components
146
- """
147
  s = pd.Series(urls, dtype=str)
148
  out = pd.DataFrame()
149
 
@@ -185,7 +181,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
185
  }
186
  out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
187
 
188
- # Punycode indicator (internationalized domain names - often used in homoglyph attacks)
189
  out["has_punycode"] = hosts.str.contains("xn--").astype(int)
190
 
191
  # SLD stats
@@ -252,10 +248,11 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
252
  out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
253
 
254
  # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
 
255
  def _detect_lookalike_chars(url: str) -> int:
256
  """
257
  Detects if URL contains Unicode characters that visually resemble ASCII letters.
258
- Common lookalikes used in phishing homoglyph attacks:
259
  - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
260
  - Greek: α, ο (look like a, o)
261
  - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
@@ -319,20 +316,11 @@ def _normalize_url_string(url: str) -> str:
319
 
320
  @app.get("/")
321
  def root():
322
- return {
323
- "status": "ok",
324
- "service": "PhishWatch Pro - Module 4: URL Analyzer",
325
- "backend": "Random Forest (GPU accelerated)"
326
- }
327
 
328
 
329
  @app.post("/predict-url")
330
  def predict_url(payload: PredictUrlPayload):
331
- """
332
- MODULE 4: URL Analyzer
333
- Analyzes URL construction, domain composition, and critical components
334
- Returns phishing risk score with confidence level and threat type
335
- """
336
  try:
337
  _load_url_model()
338
 
@@ -374,8 +362,7 @@ def predict_url(payload: PredictUrlPayload):
374
  "backend": str(model_type),
375
  "threshold": 0.5,
376
  "url_col": url_col,
377
- "override": {"reason": "csv_url_match", "module": "4_url_analyzer"},
378
- "threat_type": "known_phishing_url" if label == "PHISH" else "known_safe",
379
  }
380
 
381
  # Known-host override (suffix match)
@@ -396,22 +383,23 @@ def predict_url(payload: PredictUrlPayload):
396
  "backend": str(model_type),
397
  "threshold": 0.5,
398
  "url_col": url_col,
399
- "override": {"reason": "known_host_match", "module": "4_url_analyzer"},
400
- "threat_type": "known_phishing_domain" if label == "PHISH" else "known_safe",
401
  }
402
 
403
- # Lookalike character guard: detect homoglyph/lookalike attacks (heuristic indicator)
404
  try:
 
405
  lookalikes_cyrillic = {
406
  'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
407
  'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
408
  'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
409
  }
410
 
 
411
  lookalikes_greek = {
412
  'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
413
  }
414
 
 
415
  lookalikes_latin = {
416
  'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
417
  'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
@@ -431,17 +419,15 @@ def predict_url(payload: PredictUrlPayload):
431
  "predicted_label": int(predicted_label),
432
  "score": float(score),
433
  "phishing_probability": float(phish_proba),
434
- "backend": "homoglyph_guard",
435
  "threshold": 0.5,
436
  "url_col": url_col,
437
- "rule": "homoglyph_character_detected",
438
- "threat_type": "homoglyph_attack",
439
- "module": "4_url_analyzer_heuristic",
440
  }
441
  except Exception:
442
  pass
443
 
444
- # Typosquat guard: detect brand impersonation with typos (heuristic indicator)
445
  try:
446
  s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
447
  s_sld = s_host.split(".")[-2] if "." in s_host else s_host
@@ -480,14 +466,12 @@ def predict_url(payload: PredictUrlPayload):
480
  "backend": "typosquat_guard",
481
  "threshold": 0.5,
482
  "url_col": url_col,
483
- "rule": "typosquat_detected",
484
- "threat_type": "brand_impersonation",
485
- "module": "4_url_analyzer_heuristic",
486
  }
487
  except Exception:
488
  pass
489
 
490
- # Random Forest Model Inference (primary detection)
491
  feats = _engineer_features([url_str], feature_cols)
492
  if model_type == "xgboost_bst":
493
  if xgb is None:
@@ -508,22 +492,6 @@ def predict_url(payload: PredictUrlPayload):
508
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
509
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
510
 
511
- # Determine threat type based on features
512
- threat_type = "unknown"
513
- if label == "PHISH":
514
- if feats["has_ip"].iloc[0] == 1:
515
- threat_type = "ip_based_phishing"
516
- elif feats["has_lookalike_chars"].iloc[0] == 1:
517
- threat_type = "homoglyph_phishing"
518
- elif feats["subdomain_count"].iloc[0] > 3:
519
- threat_type = "subdomain_abuse"
520
- elif feats["tld_suspicious"].iloc[0] == 1:
521
- threat_type = "suspicious_tld"
522
- elif any(feats[f"has_{tok}"].iloc[0] == 1 for tok in ["login", "verify", "secure", "bank", "pay"]):
523
- threat_type = "phishing_lure"
524
- else:
525
- threat_type = "anomalous_url_structure"
526
-
527
  return {
528
  "label": label,
529
  "predicted_label": int(predicted_label),
@@ -532,15 +500,8 @@ def predict_url(payload: PredictUrlPayload):
532
  "backend": str(model_type),
533
  "threshold": 0.5,
534
  "url_col": url_col,
535
- "threat_type": threat_type,
536
- "module": "4_url_analyzer_random_forest",
537
- "features": {
538
- "url_length": float(feats["url_len"].iloc[0]),
539
- "subdomain_count": float(feats["subdomain_count"].iloc[0]),
540
- "has_ip": bool(feats["has_ip"].iloc[0]),
541
- "suspicious_tld": bool(feats["tld_suspicious"].iloc[0]),
542
- "has_punycode": bool(feats["has_punycode"].iloc[0]),
543
- }
544
  }
545
  except Exception as e:
546
- return JSONResponse(status_code=500, content={"error": str(e)})
 
 
 
140
 
141
 
142
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
 
 
 
 
143
  s = pd.Series(urls, dtype=str)
144
  out = pd.DataFrame()
145
 
 
181
  }
182
  out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
183
 
184
+ # Punycode indicator
185
  out["has_punycode"] = hosts.str.contains("xn--").astype(int)
186
 
187
  # SLD stats
 
248
  out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
249
 
250
  # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
251
+ # Examples: Cyrillic а (U+0430) looks like 'a', Greek α (U+03B1) looks like 'a', etc.
252
  def _detect_lookalike_chars(url: str) -> int:
253
  """
254
  Detects if URL contains Unicode characters that visually resemble ASCII letters.
255
+ Common lookalikes used in phishing:
256
  - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
257
  - Greek: α, ο (look like a, o)
258
  - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
 
316
 
317
  @app.get("/")
318
  def root():
319
+ return {"status": "ok", "backend": "url-only"}
 
 
 
 
320
 
321
 
322
  @app.post("/predict-url")
323
  def predict_url(payload: PredictUrlPayload):
 
 
 
 
 
324
  try:
325
  _load_url_model()
326
 
 
362
  "backend": str(model_type),
363
  "threshold": 0.5,
364
  "url_col": url_col,
365
+ "override": {"reason": "csv_url_match"},
 
366
  }
367
 
368
  # Known-host override (suffix match)
 
383
  "backend": str(model_type),
384
  "threshold": 0.5,
385
  "url_col": url_col,
 
 
386
  }
387
 
388
+ # Lookalike character guard: detect homoglyph/lookalike attacks
389
  try:
390
+ # Cyrillic characters that look like ASCII letters
391
  lookalikes_cyrillic = {
392
  'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
393
  'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
394
  'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
395
  }
396
 
397
+ # Greek characters that look like ASCII letters
398
  lookalikes_greek = {
399
  'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
400
  }
401
 
402
+ # Latin Extended lookalikes
403
  lookalikes_latin = {
404
  'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
405
  'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
 
419
  "predicted_label": int(predicted_label),
420
  "score": float(score),
421
  "phishing_probability": float(phish_proba),
422
+ "backend": "lookalike_guard",
423
  "threshold": 0.5,
424
  "url_col": url_col,
425
+ "rule": "lookalike_character_detected",
 
 
426
  }
427
  except Exception:
428
  pass
429
 
430
+ # Typosquat guard: mirror notebook fallback logic.
431
  try:
432
  s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
433
  s_sld = s_host.split(".")[-2] if "." in s_host else s_host
 
466
  "backend": "typosquat_guard",
467
  "threshold": 0.5,
468
  "url_col": url_col,
469
+ "rule": "typosquat_guard",
 
 
470
  }
471
  except Exception:
472
  pass
473
 
474
+ # Mirror inference flow for probability of class 1
475
  feats = _engineer_features([url_str], feature_cols)
476
  if model_type == "xgboost_bst":
477
  if xgb is None:
 
492
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
493
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  return {
496
  "label": label,
497
  "predicted_label": int(predicted_label),
 
500
  "backend": str(model_type),
501
  "threshold": 0.5,
502
  "url_col": url_col,
 
 
 
 
 
 
 
 
 
503
  }
504
  except Exception as e:
505
+ return JSONResponse(status_code=500, content={"error": str(e)})
506
+
507
+